Add scraper

This commit is contained in:
Myriad-Dreamin 2025-05-28 21:13:56 +08:00
parent 9a95966302
commit 8311997274
3 changed files with 218 additions and 0 deletions

216
Cargo.lock generated
View File

@ -546,6 +546,29 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
[[package]]
name = "cssparser"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn",
]
[[package]] [[package]]
name = "csv" name = "csv"
version = "1.3.1" version = "1.3.1"
@ -593,6 +616,17 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "derive_more"
version = "0.99.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "dirs" name = "dirs"
version = "6.0.0" version = "6.0.0"
@ -631,6 +665,21 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
[[package]]
name = "dtoa"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]] [[package]]
name = "ecow" name = "ecow"
version = "0.2.3" version = "0.2.3"
@ -640,6 +689,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]] [[package]]
name = "either" name = "either"
version = "1.13.0" version = "1.13.0"
@ -862,6 +917,16 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]] [[package]]
name = "fxhash" name = "fxhash"
version = "0.2.1" version = "0.2.1"
@ -971,6 +1036,18 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "html5ever"
version = "0.29.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c"
dependencies = [
"log",
"mac",
"markup5ever",
"match_token",
]
[[package]] [[package]]
name = "httpdate" name = "httpdate"
version = "1.0.3" version = "1.0.3"
@ -1542,6 +1619,37 @@ dependencies = [
"pkg-config", "pkg-config",
] ]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "match_token"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.4" version = "2.7.4"
@ -1614,6 +1722,12 @@ dependencies = [
"tempfile", "tempfile",
] ]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]] [[package]]
name = "nom" name = "nom"
version = "7.1.3" version = "7.1.3"
@ -1871,6 +1985,16 @@ dependencies = [
"phf_shared", "phf_shared",
] ]
[[package]]
name = "phf_codegen"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]] [[package]]
name = "phf_generator" name = "phf_generator"
version = "0.11.3" version = "0.11.3"
@ -1983,6 +2107,12 @@ dependencies = [
"zerocopy", "zerocopy",
] ]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.93" version = "1.0.93"
@ -2284,6 +2414,21 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "527e65d9d888567588db4c12da1087598d0f6f8b346cc2c5abc91f05fc2dffe2"
dependencies = [
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]] [[package]]
name = "security-framework" name = "security-framework"
version = "2.11.1" version = "2.11.1"
@ -2307,6 +2452,25 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "selectors"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8"
dependencies = [
"bitflags 2.8.0",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]] [[package]]
name = "self-replace" name = "self-replace"
version = "1.5.0" version = "1.5.0"
@ -2390,6 +2554,15 @@ dependencies = [
"unsafe-libyaml", "unsafe-libyaml",
] ]
[[package]]
name = "servo_arc"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a"
dependencies = [
"stable_deref_trait",
]
[[package]] [[package]]
name = "shell-escape" name = "shell-escape"
version = "0.1.5" version = "0.1.5"
@ -2501,6 +2674,31 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "string_cache"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
[[package]] [[package]]
name = "strsim" name = "strsim"
version = "0.11.1" version = "0.11.1"
@ -2623,6 +2821,17 @@ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]] [[package]]
name = "termcolor" name = "termcolor"
version = "1.4.1" version = "1.4.1"
@ -3089,6 +3298,7 @@ dependencies = [
"roxmltree", "roxmltree",
"rust_decimal", "rust_decimal",
"rustybuzz", "rustybuzz",
"scraper",
"serde", "serde",
"serde_json", "serde_json",
"serde_yaml 0.9.34+deprecated", "serde_yaml 0.9.34+deprecated",
@ -3419,6 +3629,12 @@ dependencies = [
"xmlwriter", "xmlwriter",
] ]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]] [[package]]
name = "utf16_iter" name = "utf16_iter"
version = "1.0.5" version = "1.0.5"

View File

@ -104,6 +104,7 @@ roxmltree = "0.20"
rust_decimal = { version = "1.36.0", default-features = false, features = ["maths"] } rust_decimal = { version = "1.36.0", default-features = false, features = ["maths"] }
rustybuzz = "0.20" rustybuzz = "0.20"
same-file = "1" same-file = "1"
scraper = "0.23.1"
self-replace = "1.3.7" self-replace = "1.3.7"
semver = "1" semver = "1"
serde = { version = "1.0.184", features = ["derive"] } serde = { version = "1.0.184", features = ["derive"] }

View File

@ -50,6 +50,7 @@ regex-syntax = { workspace = true }
roxmltree = { workspace = true } roxmltree = { workspace = true }
rust_decimal = { workspace = true } rust_decimal = { workspace = true }
rustybuzz = { workspace = true } rustybuzz = { workspace = true }
scraper = { workspace = true }
serde = { workspace = true } serde = { workspace = true }
serde_json = { workspace = true } serde_json = { workspace = true }
serde_yaml = { workspace = true } serde_yaml = { workspace = true }