added generic adapter
This commit is contained in:
Generated
+338
-1
@@ -19,6 +19,19 @@ dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"getrandom 0.3.4",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.4"
|
||||
@@ -404,6 +417,29 @@ dependencies = [
|
||||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.31.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
|
||||
dependencies = [
|
||||
"cssparser-macros",
|
||||
"dtoa-short",
|
||||
"itoa",
|
||||
"phf 0.11.3",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser-macros"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn 2.0.118",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.20.11"
|
||||
@@ -459,6 +495,17 @@ dependencies = [
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.118",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "digest"
|
||||
version = "0.10.7"
|
||||
@@ -488,6 +535,27 @@ version = "0.15.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
|
||||
|
||||
[[package]]
|
||||
name = "dtoa"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
|
||||
|
||||
[[package]]
|
||||
name = "dtoa-short"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
|
||||
dependencies = [
|
||||
"dtoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ego-tree"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.16.0"
|
||||
@@ -611,6 +679,16 @@ version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
|
||||
|
||||
[[package]]
|
||||
name = "futf"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
|
||||
dependencies = [
|
||||
"mac",
|
||||
"new_debug_unreachable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.32"
|
||||
@@ -708,6 +786,15 @@ dependencies = [
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fxhash"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
@@ -718,6 +805,15 @@ dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getopts"
|
||||
version = "0.2.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.17"
|
||||
@@ -764,7 +860,7 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"ahash 0.7.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -843,6 +939,20 @@ dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.27.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.118",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.4.2"
|
||||
@@ -1203,6 +1313,26 @@ version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
|
||||
|
||||
[[package]]
|
||||
name = "mac"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45"
|
||||
dependencies = [
|
||||
"log",
|
||||
"phf 0.11.3",
|
||||
"phf_codegen 0.11.3",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.2.0"
|
||||
@@ -1261,6 +1391,12 @@ dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "8.0.0"
|
||||
@@ -1392,6 +1528,96 @@ version = "2.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
|
||||
dependencies = [
|
||||
"phf_shared 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
|
||||
dependencies = [
|
||||
"phf_macros",
|
||||
"phf_shared 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
|
||||
dependencies = [
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
|
||||
dependencies = [
|
||||
"phf_shared 0.10.0",
|
||||
"rand 0.8.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
||||
dependencies = [
|
||||
"phf_shared 0.11.3",
|
||||
"rand 0.8.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_macros"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
|
||||
dependencies = [
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.118",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
|
||||
dependencies = [
|
||||
"siphasher 0.3.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
|
||||
dependencies = [
|
||||
"siphasher 1.0.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.17"
|
||||
@@ -1455,6 +1681,12 @@ dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "precomputed-hash"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "prettyplease"
|
||||
version = "0.2.37"
|
||||
@@ -1925,12 +2157,47 @@ version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "scraper"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b90460b31bfe1fc07be8262e42c665ad97118d4585869de9345a84d501a9eaf0"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"cssparser",
|
||||
"ego-tree",
|
||||
"getopts",
|
||||
"html5ever",
|
||||
"once_cell",
|
||||
"selectors",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "seahash"
|
||||
version = "4.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
|
||||
|
||||
[[package]]
|
||||
name = "selectors"
|
||||
version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cssparser",
|
||||
"derive_more",
|
||||
"fxhash",
|
||||
"log",
|
||||
"new_debug_unreachable",
|
||||
"phf 0.10.1",
|
||||
"phf_codegen 0.10.0",
|
||||
"precomputed-hash",
|
||||
"servo_arc",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.28"
|
||||
@@ -2003,6 +2270,15 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "servo_arc"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
|
||||
dependencies = [
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha1"
|
||||
version = "0.10.6"
|
||||
@@ -2053,6 +2329,7 @@ dependencies = [
|
||||
"rand 0.8.6",
|
||||
"reqwest",
|
||||
"rust_decimal",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
@@ -2103,6 +2380,18 @@ version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "0.3.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.12"
|
||||
@@ -2354,6 +2643,31 @@ version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
|
||||
|
||||
[[package]]
|
||||
name = "string_cache"
|
||||
version = "0.8.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
|
||||
dependencies = [
|
||||
"new_debug_unreachable",
|
||||
"parking_lot",
|
||||
"phf_shared 0.11.3",
|
||||
"precomputed-hash",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "string_cache_codegen"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
|
||||
dependencies = [
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stringprep"
|
||||
version = "0.1.5"
|
||||
@@ -2425,6 +2739,17 @@ version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
|
||||
dependencies = [
|
||||
"futf",
|
||||
"mac",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.69"
|
||||
@@ -2855,6 +3180,12 @@ version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.6"
|
||||
@@ -2879,6 +3210,12 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utf8_iter"
|
||||
version = "1.0.4"
|
||||
|
||||
@@ -38,3 +38,4 @@ rust_decimal = { version = "1", features = ["serde-float"] }
|
||||
dotenvy = "0.15"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls", "gzip"] }
|
||||
url = "2"
|
||||
scraper = "0.20"
|
||||
|
||||
@@ -0,0 +1,280 @@
|
||||
//! Generic structured-data adapter.
//!
//! The Shopify adapter speaks a specific platform API. This one is the
//! catch-all: it fetches the product page HTML and reads whatever standard
//! product metadata it carries, from three sources:
//!
//! 1. **JSON-LD** (`<script type="application/ld+json">`) — a schema.org
//!    `Product` with an `offers` block. The richest, most common source.
//! 2. **Microdata** — `itemprop="price"`, `priceCurrency`, `availability`
//!    attributes scattered on the page (schema.org again, inline).
//! 3. **OpenGraph / meta** — `og:title`, `og:image`, `product:price:amount`.
//!
//! The sources are merged in the order JSON-LD, OpenGraph / meta, microdata;
//! each one only fills gaps left by those before it, so a page with a JSON-LD
//! title but a microdata price still resolves. No shop is hardcoded; this works
//! on any store that emits standard product markup (Shopware, Magento,
//! WooCommerce, …).
|
||||
|
||||
use std::str::FromStr;
|
||||
|
||||
use rust_decimal::Decimal;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
use super::FetchedProduct;
|
||||
|
||||
/// Fields gathered from the page, each independently optional.
|
||||
#[derive(Default)]
|
||||
struct Candidate {
|
||||
title: Option<String>,
|
||||
price: Option<Decimal>,
|
||||
currency: Option<String>,
|
||||
image: Option<String>,
|
||||
in_stock: Option<bool>,
|
||||
}
|
||||
|
||||
impl Candidate {
|
||||
/// Fill any still-empty field from `other` (earlier sources win).
|
||||
fn fill_from(&mut self, other: Candidate) {
|
||||
self.title = self.title.take().or(other.title);
|
||||
self.price = self.price.take().or(other.price);
|
||||
self.currency = self.currency.take().or(other.currency);
|
||||
self.image = self.image.take().or(other.image);
|
||||
self.in_stock = self.in_stock.take().or(other.in_stock);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `Ok(None)` when the page carries no usable product data (so this is
|
||||
/// a true fallback that doesn't mask "couldn't read it" as success); `Err` on a
|
||||
/// network/HTTP failure.
|
||||
pub async fn fetch(
|
||||
client: &reqwest::Client,
|
||||
raw_url: &str,
|
||||
default_currency: &str,
|
||||
) -> anyhow::Result<Option<FetchedProduct>> {
|
||||
let resp = client.get(raw_url).send().await?;
|
||||
if !resp.status().is_success() {
|
||||
anyhow::bail!("product page returned HTTP {}", resp.status());
|
||||
}
|
||||
let body = resp.text().await?;
|
||||
|
||||
// Parsing the DOM is sync + uses non-Send types (scraper/Html), so confine
|
||||
// it to a closure with no awaits across it.
|
||||
let mut c = Candidate::default();
|
||||
{
|
||||
let doc = Html::parse_document(&body);
|
||||
// JSON-LD first (a properly-scoped Product, most reliable). Then OG meta
|
||||
// for title/image — those are explicitly the product, unlike loose
|
||||
// page-wide microdata `name`/`image`, which often hit a breadcrumb or
|
||||
// logo. Microdata last: it still supplies price/currency/availability
|
||||
// (which OG usually omits) without clobbering the good title/image.
|
||||
c.fill_from(from_json_ld(&doc));
|
||||
c.fill_from(from_meta(&doc));
|
||||
c.fill_from(from_microdata(&doc));
|
||||
}
|
||||
|
||||
let Some(price) = c.price else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
Ok(Some(FetchedProduct {
|
||||
title: c.title.unwrap_or_else(|| "Untitled product".to_string()),
|
||||
price,
|
||||
currency: c.currency.unwrap_or_else(|| default_currency.to_string()),
|
||||
image_url: c.image,
|
||||
in_stock: c.in_stock,
|
||||
source: "generic",
|
||||
}))
|
||||
}
|
||||
|
||||
/// schema.org Product from any JSON-LD block (handles bare objects, arrays and
|
||||
/// `@graph` wrappers).
|
||||
fn from_json_ld(doc: &Html) -> Candidate {
|
||||
let sel = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
|
||||
for el in doc.select(&sel) {
|
||||
let raw = el.text().collect::<String>();
|
||||
let Ok(json) = serde_json::from_str::<serde_json::Value>(&raw) else {
|
||||
continue;
|
||||
};
|
||||
if let Some(node) = find_product(&json) {
|
||||
return product_from_json(node);
|
||||
}
|
||||
}
|
||||
Candidate::default()
|
||||
}
|
||||
|
||||
/// Depth-first search for a node whose `@type` is (or includes) `Product`.
|
||||
fn find_product(v: &serde_json::Value) -> Option<&serde_json::Value> {
|
||||
match v {
|
||||
serde_json::Value::Object(map) => {
|
||||
if let Some(t) = map.get("@type") {
|
||||
let is_product = match t {
|
||||
serde_json::Value::String(s) => s == "Product",
|
||||
serde_json::Value::Array(a) => {
|
||||
a.iter().any(|x| x.as_str() == Some("Product"))
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
if is_product {
|
||||
return Some(v);
|
||||
}
|
||||
}
|
||||
// Descend (covers `@graph`, `mainEntity`, nested arrays, …).
|
||||
map.values().find_map(find_product)
|
||||
}
|
||||
serde_json::Value::Array(a) => a.iter().find_map(find_product),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn product_from_json(p: &serde_json::Value) -> Candidate {
|
||||
// `offers` may be an object or an array of offers; take the first.
|
||||
let offer = match p.get("offers") {
|
||||
Some(serde_json::Value::Array(a)) => a.first(),
|
||||
other => other,
|
||||
};
|
||||
|
||||
let price = offer
|
||||
.and_then(|o| o.get("price"))
|
||||
.and_then(json_to_price)
|
||||
.or_else(|| {
|
||||
offer
|
||||
.and_then(|o| o.get("priceSpecification"))
|
||||
.and_then(|s| s.get("price"))
|
||||
.and_then(json_to_price)
|
||||
});
|
||||
|
||||
let currency = offer
|
||||
.and_then(|o| o.get("priceCurrency"))
|
||||
.and_then(|c| c.as_str())
|
||||
.map(normalize_currency);
|
||||
|
||||
let in_stock = offer
|
||||
.and_then(|o| o.get("availability"))
|
||||
.and_then(|a| a.as_str())
|
||||
.map(availability_in_stock);
|
||||
|
||||
Candidate {
|
||||
title: p.get("name").and_then(|n| n.as_str()).map(str::to_string),
|
||||
price,
|
||||
currency,
|
||||
image: p.get("image").and_then(json_first_string),
|
||||
in_stock,
|
||||
}
|
||||
}
|
||||
|
||||
/// Inline schema.org microdata (`itemprop` attributes).
|
||||
fn from_microdata(doc: &Html) -> Candidate {
|
||||
Candidate {
|
||||
title: itemprop_value(doc, "name"),
|
||||
price: itemprop_value(doc, "price").and_then(|s| parse_price(&s)),
|
||||
currency: itemprop_value(doc, "priceCurrency").map(|s| normalize_currency(&s)),
|
||||
image: itemprop_value(doc, "image"),
|
||||
in_stock: itemprop_availability(doc).map(|s| availability_in_stock(&s)),
|
||||
}
|
||||
}
|
||||
|
||||
/// OpenGraph / product meta tags.
|
||||
fn from_meta(doc: &Html) -> Candidate {
|
||||
Candidate {
|
||||
title: meta_content(doc, "property", "og:title"),
|
||||
price: meta_content(doc, "property", "product:price:amount")
|
||||
.and_then(|s| parse_price(&s)),
|
||||
currency: meta_content(doc, "property", "product:price:currency")
|
||||
.map(|s| normalize_currency(&s)),
|
||||
image: meta_content(doc, "property", "og:image"),
|
||||
in_stock: meta_content(doc, "property", "og:availability")
|
||||
.or_else(|| meta_content(doc, "property", "product:availability"))
|
||||
.map(|s| availability_in_stock(&s)),
|
||||
}
|
||||
}
|
||||
|
||||
// --- small extraction helpers -------------------------------------------------
|
||||
|
||||
/// Value of an `itemprop` element: its `content` attribute if present, else its
|
||||
/// trimmed text.
|
||||
fn itemprop_value(doc: &Html, prop: &str) -> Option<String> {
|
||||
let sel = Selector::parse(&format!(r#"[itemprop="{prop}"]"#)).ok()?;
|
||||
let el = doc.select(&sel).next()?;
|
||||
if let Some(c) = el.value().attr("content") {
|
||||
let c = c.trim();
|
||||
if !c.is_empty() {
|
||||
return Some(c.to_string());
|
||||
}
|
||||
}
|
||||
let txt = el.text().collect::<String>();
|
||||
let txt = txt.trim();
|
||||
(!txt.is_empty()).then(|| txt.to_string())
|
||||
}
|
||||
|
||||
/// Availability is carried in `content` or `href` (`href="…/InStock"`).
|
||||
fn itemprop_availability(doc: &Html) -> Option<String> {
|
||||
let sel = Selector::parse(r#"[itemprop="availability"]"#).ok()?;
|
||||
let el = doc.select(&sel).next()?;
|
||||
el.value()
|
||||
.attr("content")
|
||||
.or_else(|| el.value().attr("href"))
|
||||
.map(str::to_string)
|
||||
}
|
||||
|
||||
fn meta_content(doc: &Html, attr: &str, key: &str) -> Option<String> {
|
||||
let sel = Selector::parse(&format!(r#"meta[{attr}="{key}"]"#)).ok()?;
|
||||
let v = doc.select(&sel).next()?.value().attr("content")?.trim();
|
||||
(!v.is_empty()).then(|| v.to_string())
|
||||
}
|
||||
|
||||
fn json_to_price(v: &serde_json::Value) -> Option<Decimal> {
|
||||
match v {
|
||||
serde_json::Value::String(s) => parse_price(s),
|
||||
serde_json::Value::Number(n) => Decimal::from_str(&n.to_string()).ok(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// `image` may be a string, an array of strings, or an `ImageObject`.
|
||||
fn json_first_string(v: &serde_json::Value) -> Option<String> {
|
||||
match v {
|
||||
serde_json::Value::String(s) => Some(s.clone()),
|
||||
serde_json::Value::Array(a) => a.iter().find_map(json_first_string),
|
||||
serde_json::Value::Object(o) => o.get("url").and_then(json_first_string),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Map an availability URL or keyword to in-stock truth.
///
/// Matching is case-insensitive and ignores whitespace, `_` and `-`, so the
/// schema.org form (`https://schema.org/InStock`) and the spaced keywords used
/// in meta tags (`in stock`, `pre-order`, `back order`) are treated alike.
///
/// InStock / PreOrder / BackOrder count as "can be had"; everything else
/// (OutOfStock, SoldOut, Discontinued, unknown or empty values) does not.
fn availability_in_stock(s: &str) -> bool {
    let normalized: String = s
        .chars()
        .filter(|c| !c.is_whitespace() && *c != '_' && *c != '-')
        .map(|c| c.to_ascii_lowercase())
        .collect();

    ["instock", "preorder", "backorder"]
        .iter()
        .any(|needle| normalized.contains(needle))
}
|
||||
|
||||
/// Canonical form of a currency code: surrounding whitespace removed and
/// every character upper-cased.
fn normalize_currency(s: &str) -> String {
    s.trim().chars().flat_map(char::to_uppercase).collect()
}
|
||||
|
||||
/// Parse a price string like `17.95`, `1.234,56`, `€ 17,95` into a Decimal.
|
||||
fn parse_price(raw: &str) -> Option<Decimal> {
|
||||
// Keep only digits and separators.
|
||||
let mut s: String = raw
|
||||
.chars()
|
||||
.filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',')
|
||||
.collect();
|
||||
if s.is_empty() {
|
||||
return None;
|
||||
}
|
||||
match (s.rfind('.'), s.rfind(',')) {
|
||||
// Both present: the rightmost separator is the decimal point; the other
|
||||
// is a thousands separator and gets stripped.
|
||||
(Some(dot), Some(comma)) => {
|
||||
if comma > dot {
|
||||
s = s.replace('.', "").replace(',', ".");
|
||||
} else {
|
||||
s = s.replace(',', "");
|
||||
}
|
||||
}
|
||||
// Only a comma: it's the decimal separator (European style).
|
||||
(None, Some(_)) => s = s.replace(',', "."),
|
||||
// Only a dot, or neither: already parseable.
|
||||
_ => {}
|
||||
}
|
||||
Decimal::from_str(&s).ok()
|
||||
}
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
use rust_decimal::Decimal;
|
||||
|
||||
mod generic;
|
||||
mod shopify;
|
||||
|
||||
/// Normalised product snapshot produced by an adapter.
|
||||
@@ -35,5 +36,10 @@ pub async fn fetch_product(
|
||||
if let Some(p) = shopify::fetch(client, url, default_currency).await? {
|
||||
return Ok(p);
|
||||
}
|
||||
anyhow::bail!("no adapter could read this URL (only Shopify storefronts are supported for now)")
|
||||
// Catch-all: read standard product metadata (JSON-LD / microdata / OG) from
|
||||
// the page HTML. Works on any store emitting schema.org markup.
|
||||
if let Some(p) = generic::fetch(client, url, default_currency).await? {
|
||||
return Ok(p);
|
||||
}
|
||||
anyhow::bail!("no adapter could read this URL (no Shopify API and no readable product metadata)")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user