diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 769fdc1..d2417be 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -17,6 +17,8 @@ jobs: { name: 'llm', features: '--no-default-features --features llm' }, { name: 'fetch', features: '--no-default-features --features fetch' }, { name: 'crypto', features: '--no-default-features --features crypto' }, + { name: 'crawl', features: '--no-default-features --features crawl' }, + { name: 'wasip1', features: '--no-default-features --features wasip1' }, { name: 'full', features: '--all-features' } ] diff --git a/Cargo.lock b/Cargo.lock index 1d51865..99ae506 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,7 +47,7 @@ version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ - "bitflags", + "bitflags 2.8.0", "cexpr", "clang-sys", "itertools", @@ -64,6 +64,12 @@ dependencies = [ "which", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.8.0" @@ -72,12 +78,17 @@ checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "blockless-sdk" -version = "0.1.10" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25f2739adfd37df55c6ad467ffd3e41d9d2ef1f30a1d5f9f2bb39bf4e736cd70" +checksum = "38f0d963f7f2a727ba556c12b30e0acd0bd4cfbd88c5e36efc5d613e4de40fae" dependencies = [ + "htmd", "json", + "kuchikiki", + "regex", "serde", + "serde_json", + "url", ] [[package]] @@ -127,6 +138,12 @@ dependencies = [ "libloading", ] +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "convert_case" version = "0.6.0" @@ -136,6 +153,72 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "cssparser" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa 0.4.8", + "matches", + "phf 0.8.0", + "proc-macro2", + "quote", + "smallvec", + "syn 1.0.109", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.98", +] + +[[package]] +name = "derive_more" +version = "0.99.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" +dependencies = [ + "convert_case 0.4.0", + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.98", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "dtoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "either" version = "1.13.0" @@ -189,6 +272,45 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -198,7 +320,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -218,6 +340,12 @@ dependencies = [ "serde", ] +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hashbrown" version = "0.14.5" @@ -243,12 +371,175 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "htmd" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be5de9d3e28558a12296e07d6cd7789c879417fbb3f07bdc77f1dd1df74ccbd8" +dependencies = [ + "html-escape", + "html5ever 0.31.0", + "markup5ever_rcdom", +] + +[[package]] +name = "html-escape" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476" +dependencies = [ + "utf8-width", +] + +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever 0.11.0", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "html5ever" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953cbbe631aae7fc0a112702ad5d3aaf09da38beaf45ea84610d6e1c358f569c" +dependencies = [ + "log", + "mac", + "markup5ever 0.16.2", + "match_token", +] + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + [[package]] name = "ident_case" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + [[package]] name = "indexmap" version = "2.7.1" @@ -268,6 +559,12 @@ dependencies = [ "either", ] +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + [[package]] name = "itoa" version = "1.0.14" @@ -281,7 +578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "10119b9ea70e813d800b3a3a734ec91ae1c2cdf9846c52f8a7a426ea1331ce5c" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.8.0", "fastrand", "quickcheck", "rquickjs", @@ -295,12 +592,12 @@ dependencies = [ [[package]] name = "javy-bless-plugins" -version = "0.2.3" +version = "0.2.4" dependencies = [ "anyhow", "blockless-sdk", "javy-plugin-api", - "rand", + "rand 0.8.5", "serde", "serde_json", "thiserror", @@ -332,6 +629,19 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd" +[[package]] +name = "kuchikiki" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e4755b7b995046f510a7520c42b2fed58b77bd94d5a87a8eb43d2fd126da8" +dependencies = [ + "cssparser", + "html5ever 0.26.0", + "indexmap 1.9.3", + "matches", + "selectors", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -366,12 +676,88 @@ version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + +[[package]] +name = "lock_api" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf 0.10.1", + "phf_codegen 0.10.0", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e4cd8c02f18a011991a039855480c64d74291c5792fcc160d55d77dc4de4a39" +dependencies = [ + "log", + "tendril", + "web_atoms", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.5.3-unofficial" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "853740b93240b82f68a23d8b296b2d19fc81521c298fcae44bf34bed6e445f00" +dependencies = [ + "html5ever 0.31.0", + "markup5ever 0.16.2", + "tendril", + "xml5ever", +] + +[[package]] +name = "match_token" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "matches" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" + [[package]] name = "memchr" version = "2.7.4" @@ -384,6 +770,18 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + [[package]] name = "nom" version = "7.1.3" @@ -409,6 +807,174 @@ version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +[[package]] +name = "parking_lot" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_macros", + "phf_shared 0.8.0", + "proc-macro-hack", +] + +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator 0.8.0", + "phf_shared 0.8.0", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared 0.8.0", + "rand 0.7.3", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.5", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.5", +] + +[[package]] +name = "phf_macros" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" +dependencies = [ + "phf_generator 0.8.0", + "phf_shared 0.8.0", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher 1.0.1", +] + +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "zerovec", +] + [[package]] name = "ppv-lite86" version = "0.2.20" @@ -418,6 +984,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "prettyplease" version = "0.2.29" @@ -462,6 +1034,12 @@ dependencies = [ "version_check", ] +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + [[package]] name = "proc-macro2" version = "1.0.93" @@ -479,7 +1057,7 @@ checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" dependencies = [ "env_logger", "log", - "rand", + "rand 0.8.5", ] [[package]] @@ -491,6 +1069,20 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", + "rand_pcg", +] + [[package]] name = "rand" version = "0.8.5" @@ -498,8 +1090,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", ] [[package]] @@ -509,7 +1111,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", ] [[package]] @@ -518,7 +1129,34 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "redox_syscall" +version = "0.5.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" +dependencies = [ + "bitflags 2.8.0", ] [[package]] @@ -602,10 +1240,10 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52decfafae40368b74e1513235245612680224924925736f386b9456b78efce9" dependencies = [ - "convert_case", + "convert_case 0.6.0", "fnv", "ident_case", - "indexmap", + "indexmap 2.7.1", "proc-macro-crate", "proc-macro-error", "proc-macro2", @@ -630,13 +1268,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys", @@ -649,6 +1296,38 @@ version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "selectors" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" +dependencies = [ + "bitflags 1.3.2", + "cssparser", + "derive_more", + "fxhash", + "log", + "matches", + "phf 0.8.0", + "phf_codegen 0.8.0", + "precomputed-hash", + "servo_arc", + "smallvec", + "thin-slice", +] + +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.217" @@ -684,12 +1363,22 @@ version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ - "itoa", + "itoa 1.0.14", "memchr", "ryu", "serde", ] +[[package]] +name = "servo_arc" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" +dependencies = [ + "nodrop", + "stable_deref_trait", +] + [[package]] name = "shlex" version = "1.3.0" @@ -702,7 +1391,7 @@ version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa2bcf6c6e164e81bc7a5d49fc6988b3d515d9e8c07457d7b74ffb9324b9cd40" dependencies = [ - "getrandom", + "getrandom 0.2.15", "halfbrown", "ref-cast", "serde", @@ -717,6 +1406,55 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + [[package]] name = "syn" version = "1.0.109" @@ -724,6 +1462,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", + "quote", "unicode-ident", ] @@ -738,6 +1477,34 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "thin-slice" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" + [[package]] name = "thiserror" version = "2.0.12" @@ -758,6 +1525,16 @@ dependencies = [ "syn 2.0.98", ] +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "toml_datetime" version = "0.6.8" @@ -770,7 +1547,7 @@ version = "0.19.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ - "indexmap", + "indexmap 2.7.1", "toml_datetime", "winnow", ] @@ -787,6 +1564,35 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8-width" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "value-trait" version = "0.10.1" @@ -795,7 +1601,7 @@ checksum = "9170e001f458781e92711d2ad666110f153e4e50bfd5cbd02db6547625714187" dependencies = [ "float-cmp", "halfbrown", - "itoa", + "itoa 1.0.14", "ryu", ] @@ -805,6 +1611,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -868,6 +1680,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "web_atoms" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" +dependencies = [ + "phf 0.11.3", + "phf_codegen 0.11.3", + "string_cache", + "string_cache_codegen", +] + [[package]] name = "which" version = "4.4.2" @@ -962,6 +1786,47 @@ dependencies = [ "memchr", ] +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "xml5ever" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a91563ba5a5ab749488164063f1317e327ca1daa80f00e5bd1e670ad0d78154" +dependencies = [ + "log", + "mac", + "markup5ever 0.16.2", +] + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -982,3 +1847,57 @@ dependencies = [ "quote", "syn 2.0.98", ] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] diff --git a/Cargo.toml b/Cargo.toml index 61e6951..768d989 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ crate-type = ["cdylib"] [dependencies] anyhow = "1.0.95" -blockless-sdk = { version = "0.1.10" } +blockless-sdk = { version = "0.2.0" } javy-plugin-api = { version = "3.0.0", features = ["json"] } rand = "0.8.5" serde_json = "1.0.120" @@ -30,5 +30,6 @@ opt-level = 3 crypto = [] fetch = [] llm = [] +crawl = [] wasip1 = [] -default = ["crypto", "fetch", "wasip1"] +default = ["crypto", "fetch", "crawl", "wasip1"] diff --git a/README.md b/README.md index 75f9012..034e54a 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ These are the plugins for the [Javy](https://github.com/blessnetwork/bls-javy) r |--------|-------------|--------------------------|--------------------------| | `BlessLLM` | A plugin for interacting with LLMs | ✅ | ✅ | | `BlessFetch` | A plugin for interacting with HTTP / fetch | ✅ | ✅ | +| `BlessCrawl` | A plugin for distributed web scraping | ✅ | ❌ | | `BlessCrypto` | A plugin for interacting with the crypto library | ✅ | ✅ | ## Architecture @@ -16,6 +17,7 @@ These are the plugins for the [Javy](https://github.com/blessnetwork/bls-javy) r flowchart TD subgraph "Rust Source Code" BC["BlessCrypto"]:::source + BCR["BlessCrawl"]:::source BF["BlessFetch"]:::source BL["BlessLLM"]:::source end @@ -41,6 +43,7 @@ flowchart TD %% Connections: Rust Modules to Build Pipeline BC -->|"compile"| CBP + BCR -->|"compile"| CBP BF -->|"compile"| CBP BL -->|"compile"| CBP @@ -61,6 +64,7 @@ flowchart TD %% Click Events click BC "https://github.com/blessnetwork/javy-bless-plugins/tree/main/src/crypto" + click BCR "https://github.com/blessnetwork/javy-bless-plugins/tree/main/src/crawl" click BF "https://github.com/blessnetwork/javy-bless-plugins/tree/main/src/fetch" click BL "https://github.com/blessnetwork/javy-bless-plugins/tree/main/src/llm" click EJS "https://github.com/blessnetwork/javy-bless-plugins/blob/main/examples/llm.js" @@ -92,5 +96,5 @@ cargo build --target=wasm32-wasip1 --release --all-features javy init-plugin ./target/wasm32-wasip1/release/bless_plugins.wasm -o bless_plugins.wasm # compile javascript to wasm with javy QuickJS runtime and plugin - to be executed in a WASM runtime -javy build -C plugin=bless_plugins.wasm ./examples/llm.js -o bless-llm.wasm +javy build -C plugin=bless_plugins.wasm ./examples/crawl.js -o bless-crawl.wasm ``` diff --git a/examples/crawl.js b/examples/crawl.js new file mode 100644 index 0000000..ac7d139 --- /dev/null +++ b/examples/crawl.js @@ -0,0 +1,34 @@ +// Example usage of BlessCrawl plugin + +async function main() { + const crawler = new BlessCrawl(); + try { + console.log("=== BlessCrawl Example ===\n"); + + console.log("1. Scraping webpage..."); + const scrapeResult = await crawler.scrape("https://example.com", { + format: "markdown", + }); + console.log(JSON.stringify(scrapeResult, null, 2)); + + console.log("2. Mapping links..."); + const mapResult = await crawler.map("https://example.com"); + console.log(JSON.stringify(mapResult, null, 2)); + + console.log("3. Crawling website..."); + const crawlResult = await crawler.crawl("https://example.com", { + max_depth: 2, // Only go 2 levels deep + limit: 10, // Maximum 10 pages + follow_external: false, // Don't follow external links + delay_between_requests: 1000, // 1 second delay between requests + parallel_requests: 2 // Max 2 parallel requests + }); + console.log(JSON.stringify(crawlResult, null, 2)); + } catch (error) { + console.error("Error:", error); + } +} + +main() + .then(() => console.log("\n=== Example completed ===")) + .catch((error) => console.error("Example failed:", error)); diff --git a/src/crawl/crawl.js b/src/crawl/crawl.js new file mode 100644 index 0000000..80858cd --- /dev/null +++ b/src/crawl/crawl.js @@ -0,0 +1,91 @@ +// Wrap everything in an anonymous function to avoid leaking local variables into the global scope. +(function () { + // Get a reference to the function before we delete it from `globalThis`. + const __BlessCrawl = globalThis.BlessCrawl; + + // BlessCrawl class wrapper + class BlessCrawl { + constructor(config = {}) { + // Create the underlying crawl instance through the Rust binding + this._instance = __BlessCrawl(config); + } + + /** + * Scrape webpage content and return as markdown with metadata + * @param {string} url - The URL to scrape + * @param {Object} options - Optional mapping options + * @returns {Promise} - Promise that resolves to scrape response + */ + async scrape(url, options = {}) { + if (typeof url !== 'string') { + throw new Error('URL must be a string'); + } + return new Promise((resolve, reject) => { + try { + const result = this._instance.scrape(url, options); + if (result.success) { + resolve(result.data); + } else { + reject(result.error); + } + } catch (error) { + reject(error); + } + }); + } + + /** + * Extract all links from a webpage, categorized by type + * @param {string} url - The URL to map + * @param {Object} options - Optional mapping options + * @returns {Promise} - Promise that resolves to map response + */ + async map(url, options = {}) { + if (typeof url !== 'string') { + throw new Error('URL must be a string'); + } + return new Promise((resolve, reject) => { + try { + const result = this._instance.map(url, options); + if (result.success) { + resolve(result.data); + } else { + reject(result.error); + } + } catch (error) { + reject(error); + } + }); + } + + /** + * Recursively crawl a website with configurable depth and filtering + * @param {string} url - The URL to start crawling from + * @param {Object} options - Optional crawl options + * @returns {Promise} - Promise that resolves to crawl response + */ + async crawl(url, options = {}) { + if (typeof url !== 'string') { + throw new Error('URL must be a string'); + } + return new Promise((resolve, reject) => { + try { + const result = this._instance.crawl(url, options); + if (result.success) { + resolve(result.data); + } else { + reject(result.error); + } + } catch (error) { + reject(error); + } + }); + } + } + + // Expose the BlessCrawl class globally + globalThis.BlessCrawl = BlessCrawl; + + // Delete the internal function from `globalThis` so it doesn't leak. + Reflect.deleteProperty(globalThis, "__BlessCrawl"); +})(); diff --git a/src/crawl/mod.rs b/src/crawl/mod.rs new file mode 100644 index 0000000..4b824ab --- /dev/null +++ b/src/crawl/mod.rs @@ -0,0 +1,282 @@ +use anyhow::{anyhow, Result}; +use blockless_sdk::{BlessCrawl, CrawlOptions, MapOptions, ScrapeOptions, Viewport}; +use javy_plugin_api::javy::{ + hold, hold_and_release, + quickjs::{prelude::MutFn, Function, Object, Value}, + to_js_error, Args, +}; +use std::collections::HashMap; +use std::sync::Arc; + +/// Create a BlessCrawl instance from JavaScript +pub fn bless_crawl(args: Args<'_>) -> Result> { + let (cx, args) = args.release(); + + // optional config can be provided in constructor + // Wrap in Arc for sharing across closures + let scrape_config = if args.len() > 0 { + let config_obj = args[0] + .as_object() + .ok_or_else(|| anyhow!("config must be an object"))?; + Arc::new(Some(parse_options(config_obj))) + } else { + Arc::new(None) + }; + + // Create JavaScript object wrapper + let instance = Object::new(cx.clone())?; + + // Clone Arc for the scrape closure + let scrape_config_clone = scrape_config.clone(); + instance.set( + "scrape", + Function::new( + cx.clone(), + MutFn::new(move |cx, args| { + let (cx, args) = hold_and_release!(cx, args); + + let scrape_fn = |args: Args<'_>| { + let (_cx, args) = args.release(); + + if args.is_empty() { + return Err(anyhow!("URL is required")); + } + + let url = args[0] + .as_string() + .ok_or_else(|| anyhow!("URL must be a string"))? + .to_string() + .map_err(|_| anyhow!("invalid UTF-8 in URL"))?; + + // Check if args[1] is provided, otherwise use scrape_config, otherwise error + let scrape_options = if args.len() > 1 { + let opts_obj = args[1] + .as_object() + .ok_or_else(|| anyhow!("options must be an object"))?; + parse_options(opts_obj) + } else if let Some(config) = scrape_config_clone.as_ref() { + config.clone() + } else { + return Err(anyhow!( + "No scrape options provided and no default config available" + )); + }; + + let result = BlessCrawl::with_config(scrape_options) + .map_err(|e| anyhow!("Failed to create BlessCrawl instance: {:?}", e))? + .scrape(&url, None) + .map_err(|e| anyhow!("Scrape failed: {:?}", e))?; + + // Serialize the result to JSON and parse it as a JavaScript value + let json_str = serde_json::to_string(&result) + .map_err(|e| anyhow!("Failed to serialize result: {:?}", e))?; + let js_code = format!("({})", json_str); + let js_value = cx.eval::(js_code.as_bytes()).map_err(|e| { + anyhow!("Failed to parse JSON as JavaScript value: {:?}", e) + })?; + + Ok(js_value) + }; + + scrape_fn(hold!(cx.clone(), args)).map_err(|e| to_js_error(cx, e)) + }), + ), + )?; + + // Add map method + let scrape_config_clone = scrape_config.clone(); + instance.set( + "map", + Function::new( + cx.clone(), + MutFn::new(move |cx, args| { + let (cx, args) = hold_and_release!(cx, args); + + let map_fn = |args: Args<'_>| { + let (_cx, args) = args.release(); + + if args.is_empty() { + return Err(anyhow!("URL is required")); + } + + let url = args[0] + .as_string() + .ok_or_else(|| anyhow!("URL must be a string"))? + .to_string() + .map_err(|_| anyhow!("invalid UTF-8 in URL"))?; + + let (scrape_options, map_options) = if args.len() > 1 { + let opts_obj = args[1] + .as_object() + .ok_or_else(|| anyhow!("options must be an object"))?; + (parse_options(opts_obj), parse_map_options(opts_obj)) + } else if let Some(config) = scrape_config_clone.as_ref() { + (config.clone(), MapOptions::default()) + } else { + return Err(anyhow!( + "No scrape options provided and no default config available" + )); + }; + + let result = BlessCrawl::with_config(scrape_options) + .map_err(|e| anyhow!("Failed to create BlessCrawl instance: {:?}", e))? + .map(&url, Some(map_options)) + .map_err(|e| anyhow!("Map failed: {:?}", e))?; + + let json_str = serde_json::to_string(&result) + .map_err(|e| anyhow!("Failed to serialize result: {:?}", e))?; + + let js_code = format!("({})", json_str); + let js_value = cx.eval::(js_code.as_bytes()).map_err(|e| { + anyhow!("Failed to parse JSON as JavaScript value: {:?}", e) + })?; + + Ok(js_value) + }; + + map_fn(hold!(cx.clone(), args)).map_err(|e| to_js_error(cx, e)) + }), + ), + )?; + + // Add crawl method + let scrape_config_clone = scrape_config.clone(); + instance.set( + "crawl", + Function::new( + cx.clone(), + MutFn::new(move |cx, args| { + let (cx, args) = hold_and_release!(cx, args); + + let crawl_fn = |args: Args<'_>| { + let (_cx, args) = args.release(); + + if args.is_empty() { + return Err(anyhow!("URL is required")); + } + + let url = args[0] + .as_string() + .ok_or_else(|| anyhow!("URL must be a string"))? + .to_string() + .map_err(|_| anyhow!("invalid UTF-8 in URL"))?; + + let (scrape_options, crawl_options) = if args.len() > 1 { + let opts_obj = args[1] + .as_object() + .ok_or_else(|| anyhow!("options must be an object"))?; + (parse_options(opts_obj), parse_crawl_options(opts_obj)) + } else if let Some(config) = scrape_config_clone.as_ref() { + (config.clone(), CrawlOptions::default()) + } else { + return Err(anyhow!( + "No scrape options provided and no default config available" + )); + }; + + let result = BlessCrawl::with_config(scrape_options) + .map_err(|e| anyhow!("Failed to create BlessCrawl instance: {:?}", e))? + .crawl(&url, Some(crawl_options)) + .map_err(|e| anyhow!("Crawl failed: {:?}", e))?; + + let json_str = serde_json::to_string(&result) + .map_err(|e| anyhow!("Failed to serialize result: {:?}", e))?; + + let js_code = format!("({})", json_str); + let js_value = cx.eval::(js_code.as_bytes()).map_err(|e| { + anyhow!("Failed to parse JSON as JavaScript value: {:?}", e) + })?; + + Ok(js_value) + }; + + crawl_fn(hold!(cx.clone(), args)).map_err(|e| to_js_error(cx, e)) + }), + ), + )?; + + Ok(Value::from_object(instance)) +} + +/// Parse JavaScript object into ScrapeOptions +fn parse_options(obj: &Object) -> ScrapeOptions { + // Parse all scrape options from the object + let mut scrape_options = ScrapeOptions::default(); + if let Ok(Some(timeout)) = obj.get::<_, Option>("timeout") { + scrape_options.timeout = timeout as u32; + } + if let Ok(Some(wait_time)) = obj.get::<_, Option>("wait_time") { + scrape_options.wait_time = wait_time as u32; + } + if let Ok(Some(include_tags)) = obj.get::<_, Option>>("include_tags") { + scrape_options.include_tags = Some(include_tags); + } + if let Ok(Some(exclude_tags)) = obj.get::<_, Option>>("exclude_tags") { + scrape_options.exclude_tags = Some(exclude_tags); + } + if let Ok(Some(only_main_content)) = obj.get::<_, Option>("only_main_content") { + scrape_options.only_main_content = only_main_content; + } + if let Ok(Some(format)) = obj.get::<_, Option>("format") { + scrape_options.format = format.parse().unwrap_or_default(); + } + if let Ok(Some(viewport_obj)) = obj.get::<_, Option>("viewport") { + let mut viewport = Viewport::default(); + if let Ok(Some(width)) = viewport_obj.get::<_, Option>("width") { + viewport.width = Some(width); + } + if let Ok(Some(height)) = viewport_obj.get::<_, Option>("height") { + viewport.height = Some(height); + } + scrape_options.viewport = Some(viewport); + } + if let Ok(Some(user_agent)) = obj.get::<_, Option>("user_agent") { + scrape_options.user_agent = Some(user_agent); + } + if let Ok(Some(headers_obj)) = obj.get::<_, Option>>("headers") { + scrape_options.headers = Some(headers_obj); + } + scrape_options +} + +/// Parse JavaScript object into MapOptions +fn parse_map_options(obj: &Object) -> MapOptions { + let mut options = MapOptions::default(); + if let Ok(Some(base_url)) = obj.get::<_, Option>("base_url") { + options.base_url = Some(base_url); + } + if let Ok(Some(link_types)) = obj.get::<_, Option>>("link_types") { + options.link_types = Some(link_types); + } + if let Ok(Some(filter_extensions)) = obj.get::<_, Option>>("filter_extensions") { + options.filter_extensions = Some(filter_extensions); + } + options +} + +/// Parse JavaScript object into CrawlOptions +fn parse_crawl_options(obj: &Object) -> CrawlOptions { + let mut options = CrawlOptions::default(); + if let Ok(Some(limit)) = obj.get::<_, Option>("limit") { + options.limit = Some(limit as u32); + } + if let Ok(Some(max_depth)) = obj.get::<_, Option>("max_depth") { + options.max_depth = Some(max_depth); + } + if let Ok(Some(exclude_paths)) = obj.get::<_, Option>>("exclude_paths") { + options.exclude_paths = Some(exclude_paths); + } + if let Ok(Some(include_paths)) = obj.get::<_, Option>>("include_paths") { + options.include_paths = Some(include_paths); + } + if let Ok(Some(follow_external)) = obj.get::<_, Option>("follow_external") { + options.follow_external = Some(follow_external); + } + if let Ok(Some(delay)) = obj.get::<_, Option>("delay_between_requests") { + options.delay_between_requests = Some(delay); + } + if let Ok(Some(parallel)) = obj.get::<_, Option>("parallel_requests") { + options.parallel_requests = Some(parallel); + } + options +} diff --git a/src/lib.rs b/src/lib.rs index a5eeeaf..7301c54 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,8 @@ use javy_plugin_api::{ Config, }; +#[cfg(feature = "crawl")] +pub mod crawl; #[cfg(feature = "crypto")] pub mod crypto; #[cfg(feature = "fetch")] @@ -19,6 +21,10 @@ pub mod wasi; #[cfg(feature = "crypto")] use crypto::bless_get_random_values; + +#[cfg(feature = "crawl")] +use crawl::bless_crawl; + #[cfg(feature = "fetch")] use fetch::bless_fetch_request; @@ -94,6 +100,18 @@ pub extern "C" fn initialize_runtime() { bind!(function, wasi_preview1_path_filestat_get); } + #[cfg(feature = "crawl")] + ctx.globals().set( + "BlessCrawl", + Function::new( + ctx.clone(), + MutFn::new(move |cx, args| { + let (cx, args) = hold_and_release!(cx, args); + bless_crawl(hold!(cx.clone(), args)).map_err(|e| to_js_error(cx, e)) + }), + )?, + )?; + #[cfg(feature = "llm")] ctx.globals().set( "BlessLLM", @@ -118,6 +136,8 @@ pub extern "C" fn initialize_runtime() { #[cfg(feature = "crypto")] ctx.eval::<(), _>(include_str!("crypto/crypto.js"))?; + #[cfg(feature = "crawl")] + ctx.eval::<(), _>(include_str!("crawl/crawl.js"))?; #[cfg(feature = "fetch")] ctx.eval::<(), _>(include_str!("fetch/fetch.js"))?; #[cfg(feature = "wasip1")]