|
1 | 1 | import consola from "consola"; |
2 | | -import { parseHTML } from "linkedom"; |
| 2 | +import { HTMLAnchorElement, HTMLElement, parseHTML } from "linkedom"; |
3 | 3 |
|
4 | 4 | export async function crawlExtension( |
5 | 5 | id: string, |
6 | 6 | lang: string |
7 | 7 | ): Promise<Gql.ChromeExtension | undefined> { |
8 | 8 | consola.info("Crawling " + id); |
9 | 9 | const url = `https://chromewebstore.google.com/detail/${id}?hl=${lang}`; |
10 | | - const res = await fetch(url); |
| 10 | + const res = await fetch(url, { |
| 11 | + headers: { |
| 12 | + // Without a user agent, the request is stuck in a 302 redirect loop |
| 13 | + "User-Agent": |
| 14 | + // Firefox: |
| 15 | + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0", |
| 16 | + }, |
| 17 | + }); |
11 | 18 | if (res.status !== 200) return; |
12 | 19 |
|
13 | 20 | const html = await res.text(); |
14 | 21 | const { document } = parseHTML(html); |
15 | 22 |
|
16 | 23 | // Uncomment to debug HTML |
17 | | - // consola.info(document.documentElement.outerHTML); |
| 24 | + // Bun.write("chrome.html", document.documentElement.outerHTML); |
18 | 25 |
|
19 | | - const name = metaContent(document, "itemprop=name"); |
20 | | - const storeUrl = metaContent(document, "itemprop=url"); |
21 | | - const iconUrl = metaContent(document, "itemprop=image")?.replace( |
| 26 | + // Basic metadata |
| 27 | + const name = metaContent(document, "property=og:title"); |
| 28 | + const storeUrl = metaContent(document, "property=og:url"); |
| 29 | + const iconUrl = metaContent(document, "property=og:image")?.replace( |
22 | 30 | /=.+?$/, |
23 | 31 | "=s256" |
24 | 32 | ); |
25 | | - const weeklyActiveUsers = metaContent(document, "itemprop=interactionCount") |
26 | | - // "UserDownloads:XYZ+" |
27 | | - ?.replace("UserDownloads:", "") |
28 | | - .replace(",", "") |
29 | | - .replace("+", ""); |
30 | | - const lastUpdated = nextSpanText(document, "Updated:"); |
31 | | - const version = metaContent(document, "itemprop=version"); |
32 | 33 | const shortDescription = metaContent(document, "property=og:description"); |
33 | | - const longDescription = document |
34 | | - .querySelector("div[itemprop=description]") |
35 | | - ?.nextElementSibling?.textContent?.trim(); |
36 | 34 |
|
37 | | - const ratingDiv = document.querySelector(".rsw-stars"); |
38 | | - const rating = extractNumber(ratingDiv.title); // "Average rating: 4.78 stars" |
39 | | - const reviewCount = extractNumber(ratingDiv.textContent); // "(1024)" |
| 35 | + // Grab the main sections that contain content |
| 36 | + const sections = (document as HTMLElement).querySelectorAll( |
| 37 | + "main > * > section" |
| 38 | + ); |
| 39 | + const header: HTMLElement = sections[0]; |
| 40 | + const description: HTMLElement = sections[2]; |
| 41 | + const details: HTMLElement = sections[3]; |
| 42 | + |
| 43 | + // Header |
| 44 | + |
| 45 | + // userCountRow.outerHTML: |
| 46 | + // <div> |
| 47 | + // <a>...</a> |
| 48 | + // <a>...</a> |
| 49 | + // 73 users |
| 50 | + // </div> |
| 51 | + // Remove the anchors and extract "73" from the text content |
| 52 | + const userCountRow = header.querySelector("div:first-child > div:last-child"); |
| 53 | + userCountRow |
| 54 | + .querySelectorAll("a") |
| 55 | + .forEach((anchor: HTMLAnchorElement) => anchor.remove()); |
| 56 | + const weeklyActiveUsers = (userCountRow.textContent as string) |
| 57 | + // "XYZ+ users" |
| 58 | + .replace(" users", "") |
| 59 | + .replace(",", "") |
| 60 | + .replace("+", "") |
| 61 | + .trim(); |
| 62 | + |
| 63 | + // ratingRow.outerHTML: |
| 64 | + // <span> |
| 65 | + // <span> |
| 66 | + // <span>5.0</span> |
| 67 | + // <svg ><path /></svg> |
| 68 | + // <span>(<a><p>2 ratings</p></a>)</span> |
| 69 | + // </span> |
| 70 | + // </span> |
| 71 | + const ratingRow = header.querySelector( |
| 72 | + "div:first-child > div:nth-child(2) > span:last-child" |
| 73 | + ); |
| 74 | + const rating = extractNumber( |
| 75 | + ratingRow.querySelector("span:first-child > span:first-child").textContent |
| 76 | + ); |
| 77 | + const reviewCount = extractNumber(ratingRow.querySelector("p").textContent); |
| 78 | + |
| 79 | + // Details |
| 80 | + |
| 81 | + const detailItems = details.querySelectorAll("li > div:last-child"); |
| 82 | + const version = detailItems[0].textContent.trim(); |
| 83 | + const lastUpdated = detailItems[1].textContent.trim(); |
| 84 | + |
| 85 | + // Description |
| 86 | + |
| 87 | + const longDescription = description |
| 88 | + .querySelector("p:last-child") |
| 89 | + .textContent.replaceAll("\n\n", "\n"); |
| 90 | + |
| 91 | + // const longDescription = document |
| 92 | + // .querySelector("div[itemprop=description]") |
| 93 | + // ?.nextElementSibling?.textContent?.trim(); |
| 94 | + // |
| 95 | + // const ratingDiv = document.querySelector(".rsw-stars"); |
| 96 | + // const rating = extractNumber(ratingDiv.title); // "Average rating: 4.78 stars" |
| 97 | + // const reviewCount = extractNumber(ratingDiv.textContent); // "(1024)" |
40 | 98 |
|
41 | 99 | if (name == null) return; |
42 | 100 | if (storeUrl == null) return; |
|
0 commit comments