Skip to content

Commit ffa4104

Browse files
committed
fix: Crawl new chromewebstore.google.com page
1 parent c05d721 commit ffa4104

4 files changed

Lines changed: 116 additions & 20 deletions

File tree

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Bun Snapshot v1, https://goo.gl/fbAQLP
2+
3+
exports[`Chrome Web Store Crawler should load and crawl an extension ID correctly 1`] = `
4+
{
5+
"iconUrl": "https://lh3.googleusercontent.com/GcffNyCJaxT2G9dsQCJHhUEMlu_E0vEzph5cLPrQj7UHKat7QyCzGu69Dmp_DDUL8rY-bPMFJceQarS1wcqdwTalTg=s256",
6+
"id": "ocfdgncpifmegplaglcnglhioflaimkd",
7+
"lastUpdated": "February 4, 2024",
8+
"longDescription":
9+
"Isn't it annoying when you open a small PR, but when you look at the diff, it's +2000 -16 because you installed a new library? Or what if you had to review that PR, don't line counts like that dissuade you from starting the review?
10+
In reality, lots of code is generated nowadays and GitHub's line counts are not representative of a PR's true size.
11+
This extension subtracts generated files from the total line counts, giving you a better idea of how big a PR really is. That's it. That's all it does.
12+
Generated files are detected from the branch's root .gitattributes file. See GitHub's docs to learn how to mark a file as generated: https://docs.github.com/en/repositories/working-with-files/managing-files/customizing-how-changed-files-appear-on-github
13+
For a simple example, checkout this extension's .gitattributes file! https://github.com/aklinker1/github-better-line-counts/blob/main/.gitattributes
14+
---
15+
The extension is open source. Feel free to contribute if you have any ideas or just star it 😀
16+
https://github.com/aklinker1/github-better-line-counts"
17+
,
18+
"name": "GitHub: Better Line Counts",
19+
"rating": 5,
20+
"reviewCount": 2,
21+
"shortDescription": "Remove generated files from GitHub line counts",
22+
"storeUrl": "https://chromewebstore.google.com/detail/github-better-line-counts/ocfdgncpifmegplaglcnglhioflaimkd",
23+
"version": "1.7.1",
24+
"weeklyActiveUsers": 73,
25+
}
26+
`;
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import { describe, expect, it } from "bun:test";
2+
import { crawlExtension } from "../chrome-crawler";
3+
4+
const githubBetterLineCountsId = "ocfdgncpifmegplaglcnglhioflaimkd";
5+
6+
describe("Chrome Web Store Crawler", () => {
7+
it("should load and crawl an extension ID correctly", async () => {
8+
const res = await crawlExtension(githubBetterLineCountsId, "en");
9+
10+
expect(res).toMatchSnapshot();
11+
});
12+
});

src/crawlers/chrome-crawler.ts

Lines changed: 77 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,100 @@
11
import consola from "consola";
2-
import { parseHTML } from "linkedom";
2+
import { HTMLAnchorElement, HTMLElement, parseHTML } from "linkedom";
33

44
export async function crawlExtension(
55
id: string,
66
lang: string
77
): Promise<Gql.ChromeExtension | undefined> {
88
consola.info("Crawling " + id);
99
const url = `https://chromewebstore.google.com/detail/${id}?hl=${lang}`;
10-
const res = await fetch(url);
10+
const res = await fetch(url, {
11+
headers: {
12+
// Without a user agent, the request is stuck in a 302 redirect loop
13+
"User-Agent":
14+
// Firefox:
15+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
16+
},
17+
});
1118
if (res.status !== 200) return;
1219

1320
const html = await res.text();
1421
const { document } = parseHTML(html);
1522

1623
// Uncomment to debug HTML
17-
// consola.info(document.documentElement.outerHTML);
24+
// Bun.write("chrome.html", document.documentElement.outerHTML);
1825

19-
const name = metaContent(document, "itemprop=name");
20-
const storeUrl = metaContent(document, "itemprop=url");
21-
const iconUrl = metaContent(document, "itemprop=image")?.replace(
26+
// Basic metadata
27+
const name = metaContent(document, "property=og:title");
28+
const storeUrl = metaContent(document, "property=og:url");
29+
const iconUrl = metaContent(document, "property=og:image")?.replace(
2230
/=.+?$/,
2331
"=s256"
2432
);
25-
const weeklyActiveUsers = metaContent(document, "itemprop=interactionCount")
26-
// "UserDownloads:XYZ+"
27-
?.replace("UserDownloads:", "")
28-
.replace(",", "")
29-
.replace("+", "");
30-
const lastUpdated = nextSpanText(document, "Updated:");
31-
const version = metaContent(document, "itemprop=version");
3233
const shortDescription = metaContent(document, "property=og:description");
33-
const longDescription = document
34-
.querySelector("div[itemprop=description]")
35-
?.nextElementSibling?.textContent?.trim();
3634

37-
const ratingDiv = document.querySelector(".rsw-stars");
38-
const rating = extractNumber(ratingDiv.title); // "Average rating: 4.78 stars"
39-
const reviewCount = extractNumber(ratingDiv.textContent); // "(1024)"
35+
// Grab the main sections that contain content
36+
const sections = (document as HTMLElement).querySelectorAll(
37+
"main > * > section"
38+
);
39+
const header: HTMLElement = sections[0];
40+
const description: HTMLElement = sections[2];
41+
const details: HTMLElement = sections[3];
42+
43+
// Header
44+
45+
// userRowCount.outerHTHML:
46+
// <div>
47+
// <a>...</a>
48+
// <a>...</a>
49+
// 73 users
50+
// </div>
51+
// Remove the anchors and extract "73" from the text content
52+
const userCountRow = header.querySelector("div:first-child > div:last-child");
53+
userCountRow
54+
.querySelectorAll("a")
55+
.forEach((anchor: HTMLAnchorElement) => anchor.remove());
56+
const weeklyActiveUsers = (userCountRow.textContent as string)
57+
// "XYZ+ users"
58+
.replace(" users", "")
59+
.replace(",", "")
60+
.replace("+", "")
61+
.trim();
62+
63+
// ratingRow.outerHTML:
64+
// <span>
65+
// <span>
66+
// <span>5.0</span>
67+
// <svg ><path /></svg>
68+
// <span>(<a><p>2 ratings</p></a>)</span>
69+
// </span>
70+
// </span>
71+
const ratingRow = header.querySelector(
72+
"div:first-child > div:nth-child(2) > span:last-child"
73+
);
74+
const rating = extractNumber(
75+
ratingRow.querySelector("span:first-child > span:first-child").textContent
76+
);
77+
const reviewCount = extractNumber(ratingRow.querySelector("p").textContent);
78+
79+
// Details
80+
81+
const detailItems = details.querySelectorAll("li > div:last-child");
82+
const version = detailItems[0].textContent.trim();
83+
const lastUpdated = detailItems[1].textContent.trim();
84+
85+
// Description
86+
87+
const longDescription = description
88+
.querySelector("p:last-child")
89+
.textContent.replaceAll("\n\n", "\n");
90+
91+
// const longDescription = document
92+
// .querySelector("div[itemprop=description]")
93+
// ?.nextElementSibling?.textContent?.trim();
94+
//
95+
// const ratingDiv = document.querySelector(".rsw-stars");
96+
// const rating = extractNumber(ratingDiv.title); // "Average rating: 4.78 stars"
97+
// const reviewCount = extractNumber(ratingDiv.textContent); // "(1024)"
4098

4199
if (name == null) return;
42100
if (storeUrl == null) return;

src/services/chrome-service.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { chrome } from "../crawlers";
2-
import { createCachedDataLoader, createInMemoryCache } from "../utils/cache";
2+
import { createCachedDataLoader } from "../utils/cache";
33
import { DAY_MS } from "../utils/time";
44

55
export function createChromeService() {

0 commit comments

Comments
 (0)