added generic adapter

This commit is contained in:
2026-06-17 16:57:45 +02:00
parent f4e357a8a1
commit e38f0a3e22
4 changed files with 626 additions and 2 deletions
+338 -1
View File
@@ -19,6 +19,19 @@ dependencies = [
"version_check", "version_check",
] ]
[[package]]
name = "ahash"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
"getrandom 0.3.4",
"once_cell",
"version_check",
"zerocopy",
]
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "1.1.4" version = "1.1.4"
@@ -404,6 +417,29 @@ dependencies = [
"typenum", "typenum",
] ]
[[package]]
name = "cssparser"
version = "0.31.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf 0.11.3",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.118",
]
[[package]] [[package]]
name = "darling" name = "darling"
version = "0.20.11" version = "0.20.11"
@@ -459,6 +495,17 @@ dependencies = [
"serde_core", "serde_core",
] ]
[[package]]
name = "derive_more"
version = "0.99.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.118",
]
[[package]] [[package]]
name = "digest" name = "digest"
version = "0.10.7" version = "0.10.7"
@@ -488,6 +535,27 @@ version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
[[package]]
name = "dtoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642"
[[package]] [[package]]
name = "either" name = "either"
version = "1.16.0" version = "1.16.0"
@@ -611,6 +679,16 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.32" version = "0.3.32"
@@ -708,6 +786,15 @@ dependencies = [
"slab", "slab",
] ]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]] [[package]]
name = "generic-array" name = "generic-array"
version = "0.14.7" version = "0.14.7"
@@ -718,6 +805,15 @@ dependencies = [
"version_check", "version_check",
] ]
[[package]]
name = "getopts"
version = "0.2.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
dependencies = [
"unicode-width",
]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.2.17" version = "0.2.17"
@@ -764,7 +860,7 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
dependencies = [ dependencies = [
"ahash", "ahash 0.7.8",
] ]
[[package]] [[package]]
@@ -843,6 +939,20 @@ dependencies = [
"windows-link", "windows-link",
] ]
[[package]]
name = "html5ever"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 2.0.118",
]
[[package]] [[package]]
name = "http" name = "http"
version = "1.4.2" version = "1.4.2"
@@ -1203,6 +1313,26 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45"
dependencies = [
"log",
"phf 0.11.3",
"phf_codegen 0.11.3",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]] [[package]]
name = "matchers" name = "matchers"
version = "0.2.0" version = "0.2.0"
@@ -1261,6 +1391,12 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]] [[package]]
name = "nom" name = "nom"
version = "8.0.0" version = "8.0.0"
@@ -1392,6 +1528,96 @@ version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]]
name = "phf"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
dependencies = [
"phf_macros",
"phf_shared 0.11.3",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
name = "phf_codegen"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
dependencies = [
"phf_generator 0.11.3",
"phf_shared 0.11.3",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand 0.8.6",
]
[[package]]
name = "phf_generator"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared 0.11.3",
"rand 0.8.6",
]
[[package]]
name = "phf_macros"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
dependencies = [
"phf_generator 0.11.3",
"phf_shared 0.11.3",
"proc-macro2",
"quote",
"syn 2.0.118",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher 0.3.11",
]
[[package]]
name = "phf_shared"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
"siphasher 1.0.3",
]
[[package]] [[package]]
name = "pin-project-lite" name = "pin-project-lite"
version = "0.2.17" version = "0.2.17"
@@ -1455,6 +1681,12 @@ dependencies = [
"zerocopy", "zerocopy",
] ]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]] [[package]]
name = "prettyplease" name = "prettyplease"
version = "0.2.37" version = "0.2.37"
@@ -1925,12 +2157,47 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b90460b31bfe1fc07be8262e42c665ad97118d4585869de9345a84d501a9eaf0"
dependencies = [
"ahash 0.8.12",
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"once_cell",
"selectors",
"tendril",
]
[[package]] [[package]]
name = "seahash" name = "seahash"
version = "4.1.0" version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]]
name = "selectors"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
dependencies = [
"bitflags",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf 0.10.1",
"phf_codegen 0.10.0",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]] [[package]]
name = "semver" name = "semver"
version = "1.0.28" version = "1.0.28"
@@ -2003,6 +2270,15 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "servo_arc"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
dependencies = [
"stable_deref_trait",
]
[[package]] [[package]]
name = "sha1" name = "sha1"
version = "0.10.6" version = "0.10.6"
@@ -2053,6 +2329,7 @@ dependencies = [
"rand 0.8.6", "rand 0.8.6",
"reqwest", "reqwest",
"rust_decimal", "rust_decimal",
"scraper",
"serde", "serde",
"serde_json", "serde_json",
"sha2", "sha2",
@@ -2103,6 +2380,18 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "siphasher"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649"
[[package]] [[package]]
name = "slab" name = "slab"
version = "0.4.12" version = "0.4.12"
@@ -2354,6 +2643,31 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "string_cache"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared 0.11.3",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
dependencies = [
"phf_generator 0.11.3",
"phf_shared 0.11.3",
"proc-macro2",
"quote",
]
[[package]] [[package]]
name = "stringprep" name = "stringprep"
version = "0.1.5" version = "0.1.5"
@@ -2425,6 +2739,17 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.69" version = "1.0.69"
@@ -2855,6 +3180,12 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
[[package]] [[package]]
name = "unicode-xid" name = "unicode-xid"
version = "0.2.6" version = "0.2.6"
@@ -2879,6 +3210,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]] [[package]]
name = "utf8_iter" name = "utf8_iter"
version = "1.0.4" version = "1.0.4"
+1
View File
@@ -38,3 +38,4 @@ rust_decimal = { version = "1", features = ["serde-float"] }
dotenvy = "0.15" dotenvy = "0.15"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls", "gzip"] } reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls", "gzip"] }
url = "2" url = "2"
scraper = "0.20"
+280
View File
@@ -0,0 +1,280 @@
//! Generic structured-data adapter.
//!
//! The Shopify adapter speaks a specific platform API. This one is the
//! catch-all: it fetches the product page HTML and reads whatever standard
//! product metadata it carries, in order of reliability:
//!
//! 1. **JSON-LD** (`<script type="application/ld+json">`) — a schema.org
//! `Product` with an `offers` block. The richest, most common source.
//! 2. **Microdata** — `itemprop="price"`, `priceCurrency`, `availability`
//! attributes scattered on the page (schema.org again, inline).
//! 3. **OpenGraph / meta** — `og:title`, `og:image`, `product:price:amount`.
//!
//! Each source fills gaps left by the previous, so a page with JSON-LD title
//! but microdata price still resolves. No shop is hardcoded; this works on any
//! store that emits standard product markup (Shopware, Magento, WooCommerce, …).
use std::str::FromStr;
use rust_decimal::Decimal;
use scraper::{Html, Selector};
use super::FetchedProduct;
/// Product fields scraped from a page; any of them may be absent.
#[derive(Default)]
struct Candidate {
    title: Option<String>,
    price: Option<Decimal>,
    currency: Option<String>,
    image: Option<String>,
    in_stock: Option<bool>,
}

impl Candidate {
    /// Merge `other` into `self` without overwriting anything.
    ///
    /// A field is taken from `other` only while it is still `None` here, so
    /// whichever source was merged first keeps precedence.
    fn fill_from(&mut self, other: Candidate) {
        let Candidate {
            title,
            price,
            currency,
            image,
            in_stock,
        } = other;

        if self.title.is_none() {
            self.title = title;
        }
        if self.price.is_none() {
            self.price = price;
        }
        if self.currency.is_none() {
            self.currency = currency;
        }
        if self.image.is_none() {
            self.image = image;
        }
        if self.in_stock.is_none() {
            self.in_stock = in_stock;
        }
    }
}
/// Returns `Ok(None)` when the page carries no usable product data (so this is
/// a true fallback that doesn't mask "couldn't read it" as success); `Err` on a
/// network/HTTP failure.
pub async fn fetch(
client: &reqwest::Client,
raw_url: &str,
default_currency: &str,
) -> anyhow::Result<Option<FetchedProduct>> {
let resp = client.get(raw_url).send().await?;
if !resp.status().is_success() {
anyhow::bail!("product page returned HTTP {}", resp.status());
}
let body = resp.text().await?;
// Parsing the DOM is sync + uses non-Send types (scraper/Html), so confine
// it to a closure with no awaits across it.
let mut c = Candidate::default();
{
let doc = Html::parse_document(&body);
// JSON-LD first (a properly-scoped Product, most reliable). Then OG meta
// for title/image — those are explicitly the product, unlike loose
// page-wide microdata `name`/`image`, which often hit a breadcrumb or
// logo. Microdata last: it still supplies price/currency/availability
// (which OG usually omits) without clobbering the good title/image.
c.fill_from(from_json_ld(&doc));
c.fill_from(from_meta(&doc));
c.fill_from(from_microdata(&doc));
}
let Some(price) = c.price else {
return Ok(None);
};
Ok(Some(FetchedProduct {
title: c.title.unwrap_or_else(|| "Untitled product".to_string()),
price,
currency: c.currency.unwrap_or_else(|| default_currency.to_string()),
image_url: c.image,
in_stock: c.in_stock,
source: "generic",
}))
}
/// schema.org Product from any JSON-LD block (handles bare objects, arrays and
/// `@graph` wrappers).
fn from_json_ld(doc: &Html) -> Candidate {
let sel = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
for el in doc.select(&sel) {
let raw = el.text().collect::<String>();
let Ok(json) = serde_json::from_str::<serde_json::Value>(&raw) else {
continue;
};
if let Some(node) = find_product(&json) {
return product_from_json(node);
}
}
Candidate::default()
}
/// Locate the first JSON node typed as a schema.org `Product`.
///
/// An object matches when its `@type` is the string `"Product"` or an array
/// containing that string. A non-matching object has its values searched
/// recursively, and arrays are searched element by element, which is how
/// wrappers such as `@graph` or `mainEntity` are traversed. Scalars never match.
fn find_product(v: &serde_json::Value) -> Option<&serde_json::Value> {
    match v {
        serde_json::Value::Array(items) => items.iter().find_map(find_product),
        serde_json::Value::Object(fields) => {
            let typed_as_product = match fields.get("@type") {
                Some(serde_json::Value::String(name)) => name == "Product",
                Some(serde_json::Value::Array(names)) => {
                    names.iter().any(|n| n.as_str() == Some("Product"))
                }
                _ => false,
            };
            if typed_as_product {
                Some(v)
            } else {
                // Not a product itself: look inside its members.
                fields.values().find_map(find_product)
            }
        }
        _ => None,
    }
}
/// Build a `Candidate` from a schema.org `Product` JSON node.
///
/// Reads `name` and `image` from the product itself and `price`,
/// `priceCurrency` and `availability` from its offer. When `offers` is an
/// array the first entry is used. The price falls back to
/// `priceSpecification.price` when the offer has no parseable `price`.
///
/// Blank (empty or whitespace-only) strings are treated as missing. A field
/// that is present but empty must stay `None` so that later sources, or the
/// caller's default currency, can still fill it; a blank availability means
/// "unknown", not "out of stock".
fn product_from_json(p: &serde_json::Value) -> Candidate {
    /// Trimmed string value of `node[key]`, or `None` when absent or blank.
    fn non_blank<'a>(node: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        node.get(key)
            .and_then(|v| v.as_str())
            .map(str::trim)
            .filter(|s| !s.is_empty())
    }

    // `offers` may be an object or an array of offers; take the first.
    let offer = match p.get("offers") {
        Some(serde_json::Value::Array(a)) => a.first(),
        other => other,
    };

    let price = offer
        .and_then(|o| o.get("price"))
        .and_then(json_to_price)
        .or_else(|| {
            offer
                .and_then(|o| o.get("priceSpecification"))
                .and_then(|s| s.get("price"))
                .and_then(json_to_price)
        });

    let currency = offer
        .and_then(|o| non_blank(o, "priceCurrency"))
        .map(normalize_currency);

    let in_stock = offer
        .and_then(|o| non_blank(o, "availability"))
        .map(availability_in_stock);

    let image = p
        .get("image")
        .and_then(json_first_string)
        .map(|url| url.trim().to_string())
        .filter(|url| !url.is_empty());

    Candidate {
        title: non_blank(p, "name").map(str::to_string),
        price,
        currency,
        image,
        in_stock,
    }
}
/// Inline schema.org microdata (`itemprop` attributes).
fn from_microdata(doc: &Html) -> Candidate {
Candidate {
title: itemprop_value(doc, "name"),
price: itemprop_value(doc, "price").and_then(|s| parse_price(&s)),
currency: itemprop_value(doc, "priceCurrency").map(|s| normalize_currency(&s)),
image: itemprop_value(doc, "image"),
in_stock: itemprop_availability(doc).map(|s| availability_in_stock(&s)),
}
}
/// OpenGraph / product meta tags.
fn from_meta(doc: &Html) -> Candidate {
Candidate {
title: meta_content(doc, "property", "og:title"),
price: meta_content(doc, "property", "product:price:amount")
.and_then(|s| parse_price(&s)),
currency: meta_content(doc, "property", "product:price:currency")
.map(|s| normalize_currency(&s)),
image: meta_content(doc, "property", "og:image"),
in_stock: meta_content(doc, "property", "og:availability")
.or_else(|| meta_content(doc, "property", "product:availability"))
.map(|s| availability_in_stock(&s)),
}
}
// --- small extraction helpers -------------------------------------------------
/// Text value of the first element whose `itemprop` equals `prop`.
///
/// A non-blank `content` attribute wins; otherwise the element's trimmed text
/// content is used. Returns `None` when no element matches, when the selector
/// cannot be built, or when both candidates are blank.
///
/// NOTE(review): only `content` and text are consulted, so an element that
/// carries its value in `src` or `href` (e.g. `<img itemprop="image" src=…>`)
/// yields `None` here.
fn itemprop_value(doc: &Html, prop: &str) -> Option<String> {
    let selector = Selector::parse(&format!(r#"[itemprop="{prop}"]"#)).ok()?;
    let element = doc.select(&selector).next()?;

    let from_attr = element
        .value()
        .attr("content")
        .map(str::trim)
        .filter(|content| !content.is_empty());

    match from_attr {
        Some(content) => Some(content.to_string()),
        None => {
            let text: String = element.text().collect();
            let text = text.trim();
            if text.is_empty() {
                None
            } else {
                Some(text.to_string())
            }
        }
    }
}
/// Raw availability marker of the first `itemprop="availability"` element.
///
/// The `content` attribute is preferred; `href` (as in `href="…/InStock"`) is
/// the fallback. The value is returned untrimmed.
fn itemprop_availability(doc: &Html) -> Option<String> {
    let selector = Selector::parse(r#"[itemprop="availability"]"#).ok()?;
    let node = doc.select(&selector).next()?.value();
    let raw = match node.attr("content") {
        Some(content) => content,
        None => node.attr("href")?,
    };
    Some(raw.to_string())
}
/// Trimmed `content` of the first `<meta>` tag whose `attr` attribute equals
/// `key`, or `None` when there is no such tag, it has no `content`, or the
/// content is blank.
fn meta_content(doc: &Html, attr: &str, key: &str) -> Option<String> {
    let selector = Selector::parse(&format!(r#"meta[{attr}="{key}"]"#)).ok()?;
    let tag = doc.select(&selector).next()?;
    match tag.value().attr("content")?.trim() {
        "" => None,
        content => Some(content.to_string()),
    }
}
/// Convert a JSON price value to a `Decimal`.
///
/// Strings go through `parse_price`; numbers are rendered to text and parsed
/// as a decimal. Any other JSON type, or a value that fails to parse, gives
/// `None`.
fn json_to_price(v: &serde_json::Value) -> Option<Decimal> {
    if let serde_json::Value::String(text) = v {
        return parse_price(text);
    }
    if let serde_json::Value::Number(number) = v {
        return Decimal::from_str(&number.to_string()).ok();
    }
    None
}
/// `image` may be a string, an array of strings, or an `ImageObject`.
fn json_first_string(v: &serde_json::Value) -> Option<String> {
match v {
serde_json::Value::String(s) => Some(s.clone()),
serde_json::Value::Array(a) => a.iter().find_map(json_first_string),
serde_json::Value::Object(o) => o.get("url").and_then(json_first_string),
_ => None,
}
}
/// Decide whether an availability marker means the product can be obtained.
///
/// Accepts schema.org URLs and bare names (`https://schema.org/InStock`,
/// `PreOrder`) as well as the separated spellings used by OpenGraph and
/// catalog meta tags (`in stock`, `pre-order`, `in_stock`). Matching is
/// case-insensitive and ignores whitespace, `-` and `_`.
///
/// Returns `true` for InStock / PreOrder / BackOrder and `false` for
/// everything else (OutOfStock, SoldOut, Discontinued, empty input, …).
fn availability_in_stock(s: &str) -> bool {
    // Collapse separators so "in stock", "in-stock" and "In_Stock" all
    // normalize to the same key as schema.org's "InStock".
    let key: String = s
        .chars()
        .filter(|c| !c.is_whitespace() && *c != '-' && *c != '_')
        .map(|c| c.to_ascii_lowercase())
        .collect();

    // An explicit negation would otherwise match the "instock" marker below.
    if key.contains("notinstock") {
        return false;
    }

    ["instock", "preorder", "backorder"]
        .iter()
        .any(|marker| key.contains(*marker))
}
/// Canonical form of a currency code: surrounding whitespace removed and
/// every character upper-cased.
fn normalize_currency(s: &str) -> String {
    s.trim().chars().flat_map(char::to_uppercase).collect()
}
/// Parse a price string like `17.95`, `1.234,56`, `€ 17,95` or `1.234.567`
/// into a `Decimal`.
///
/// Everything except ASCII digits, `.` and `,` is discarded first. The
/// separators are then interpreted as follows:
///
/// * both kinds present — the rightmost one is the decimal point, the other
///   is a thousands separator and is removed;
/// * one kind, repeated, in a valid thousands layout (`1.234.567`,
///   `1,234,567`) — all of them are thousands separators and are removed;
/// * a lone comma — decimal separator (European style);
/// * a lone dot, or no separator — used as-is.
///
/// A dangling decimal point left at the end (`17,-`, `17.95.`) is dropped
/// before parsing. Returns `None` when nothing parseable remains.
fn parse_price(raw: &str) -> Option<Decimal> {
    /// True when `digits`, split on `sep`, is laid out like `1.234.567`: a
    /// leading group of one to three digits followed only by groups of three.
    fn is_thousands_layout(digits: &str, sep: char) -> bool {
        let mut groups = digits.split(sep);
        let lead_ok = groups
            .next()
            .map_or(false, |g| (1..=3).contains(&g.len()));
        lead_ok && groups.all(|g| g.len() == 3)
    }

    // Keep only digits and separators.
    let kept: String = raw
        .chars()
        .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',')
        .collect();
    if kept.is_empty() {
        return None;
    }

    let dots = kept.matches('.').count();
    let commas = kept.matches(',').count();

    let normalized = match (kept.rfind('.'), kept.rfind(',')) {
        // Both present: the rightmost separator is the decimal point.
        (Some(dot), Some(comma)) if comma > dot => kept.replace('.', "").replace(',', "."),
        (Some(_), Some(_)) => kept.replace(',', ""),
        // Repeated commas in a thousands layout: no decimal part at all.
        (None, Some(_)) if commas > 1 && is_thousands_layout(&kept, ',') => {
            kept.replace(',', "")
        }
        // Only a comma: it's the decimal separator (European style).
        (None, Some(_)) => kept.replace(',', "."),
        // Repeated dots in a thousands layout: no decimal part at all.
        (Some(_), None) if dots > 1 && is_thousands_layout(&kept, '.') => kept.replace('.', ""),
        // A single dot, or no separator: already parseable.
        _ => kept,
    };

    // Trimming happens after classification so that in "1.234,-" the comma is
    // still recognised as the decimal separator.
    Decimal::from_str(normalized.trim_end_matches('.')).ok()
}
+7 -1
View File
@@ -4,6 +4,7 @@
use rust_decimal::Decimal; use rust_decimal::Decimal;
mod generic;
mod shopify; mod shopify;
/// Normalised product snapshot produced by an adapter. /// Normalised product snapshot produced by an adapter.
@@ -35,5 +36,10 @@ pub async fn fetch_product(
if let Some(p) = shopify::fetch(client, url, default_currency).await? { if let Some(p) = shopify::fetch(client, url, default_currency).await? {
return Ok(p); return Ok(p);
} }
anyhow::bail!("no adapter could read this URL (only Shopify storefronts are supported for now)") // Catch-all: read standard product metadata (JSON-LD / microdata / OG) from
// the page HTML. Works on any store emitting schema.org markup.
if let Some(p) = generic::fetch(client, url, default_currency).await? {
return Ok(p);
}
anyhow::bail!("no adapter could read this URL (no Shopify API and no readable product metadata)")
} }