|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
| 5 | + "execution_count": null, |
6 | 6 | "id": "b637b442-3873-4c23-9159-7a4f00b98e86", |
7 | 7 | "metadata": {}, |
8 | | - "outputs": [ |
9 | | - { |
10 | | - "data": { |
11 | | - "application/vnd.jupyter.widget-view+json": { |
12 | | - "model_id": "7a52c853f3d348a490288979f5947d6e", |
13 | | - "version_major": 2, |
14 | | - "version_minor": 0 |
15 | | - }, |
16 | | - "text/plain": [ |
17 | | - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" |
18 | | - ] |
19 | | - }, |
20 | | - "metadata": {}, |
21 | | - "output_type": "display_data" |
22 | | - }, |
23 | | - { |
24 | | - "data": { |
25 | | - "application/vnd.jupyter.widget-view+json": { |
26 | | - "model_id": "2607eeec41a24eb8afec4c73143d41e9", |
27 | | - "version_major": 2, |
28 | | - "version_minor": 0 |
29 | | - }, |
30 | | - "text/plain": [ |
31 | | - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" |
32 | | - ] |
33 | | - }, |
34 | | - "metadata": {}, |
35 | | - "output_type": "display_data" |
36 | | - }, |
37 | | - { |
38 | | - "name": "stdout", |
39 | | - "output_type": "stream", |
40 | | - "text": [ |
41 | | - "DuckDB database created at repos.duckdb\n" |
42 | | - ] |
43 | | - } |
44 | | - ], |
| 8 | + "outputs": [], |
45 | 9 | "source": [ |
46 | 10 | "import duckdb\n", |
47 | 11 | "import pandas as pd\n", |
|
102 | 66 | " \"size\": lang.get(\"size\")\n", |
103 | 67 | " })\n", |
104 | 68 | "\n", |
| 69 | + " # Use a set to track seen topic names per repo\n", |
| 70 | + " seen_topics = set()\n", |
105 | 71 | " for topic in repo.get(\"topics\", []):\n", |
106 | | - " topics.append({\n", |
107 | | - " \"repo\": repo_id,\n", |
108 | | - " \"topic\": topic.get(\"name\"),\n", |
109 | | - " \"stars\": topic.get(\"stars\")\n", |
110 | | - " })\n", |
| 72 | + " topic_name = topic.get(\"name\")\n", |
| 73 | + " if topic_name and topic_name not in seen_topics:\n", |
| 74 | + " seen_topics.add(topic_name)\n", |
| 75 | + " topics.append({\n", |
| 76 | + " \"repo\": repo_id,\n", |
| 77 | + " \"topic\": topic_name,\n", |
| 78 | + " \"stars\": topic.get(\"stars\")\n", |
| 79 | + " })\n", |
111 | 80 | "\n", |
112 | 81 | "# Convert to DataFrames\n", |
113 | 82 | "df_repos = pd.DataFrame(repos)\n", |
|
129 | 98 | "\n", |
130 | 99 | "con.close()\n", |
131 | 100 | "\n", |
132 | | - "print(f\"DuckDB database created at {db_path}\")\n" |
| 101 | + "print(f\"DuckDB database created at {db_path}\")" |
133 | 102 | ] |
134 | | - }, |
135 | | - { |
136 | | - "cell_type": "code", |
137 | | - "execution_count": null, |
138 | | - "id": "2442785a-d821-47ab-bf16-86ceab3af02b", |
139 | | - "metadata": {}, |
140 | | - "outputs": [], |
141 | | - "source": [] |
142 | 103 | } |
143 | 104 | ], |
144 | 105 | "metadata": { |
|
0 commit comments