Skip to content

Commit 3adc34e

Browse files
committed
use static DB instead
1 parent: 3f09da6 · commit: 3adc34e

8 files changed

Lines changed: 2671 additions & 87 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,4 +10,5 @@ dist
1010
.DS_Store
1111
*temp
1212
*repo_metadata.json
13-
__pycache__
13+
__pycache__
14+
*.duckdb

backend/app/services/topic_service.py

Lines changed: 56 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1,62 +1,82 @@
11
from collections import Counter
22
import duckdb
33
from app.utils.cache import get_cached_topics, save_cached_topics
4+
import os
45

56
class TopicService:
7+
# Define the allowed terms for caching
8+
CACHEABLE_TERMS = {
9+
"visual-programming",
10+
"machine-learning",
11+
"logic-programming",
12+
"large-language-models"
13+
}
14+
615
def __init__(self):
7-
self.con = duckdb.connect(database=':memory:')
8-
self.con.execute("SET threads TO 16;")
9-
self.con.execute("""
10-
CREATE TEMP TABLE repo AS
11-
SELECT * FROM read_json_auto('../public/data/repo_metadata.json');
12-
""")
16+
db_path = '../public/data/github_meta.duckdb'
17+
18+
# Check if database exists
19+
if os.path.exists(db_path):
20+
# Connect in read-only mode to avoid locking issues
21+
self.con = duckdb.connect(database=db_path, read_only=True)
22+
self.con.execute("SET threads TO 8;")
23+
else:
24+
raise FileNotFoundError(
25+
f"Database not found at {db_path}. Please ensure the database file exists before running the application."
26+
)
1327

1428
def process_topics(self, search_term: str):
1529
try:
1630
search_term = search_term.lower()
1731

18-
# Check cache
19-
cached_result = get_cached_topics(search_term)
20-
if cached_result:
21-
return {
22-
"success": True,
23-
"data": cached_result,
24-
"total": len(cached_result),
25-
"cached": True
26-
}
32+
# Only check cache for allowed terms
33+
if search_term in self.CACHEABLE_TERMS:
34+
cached_result = get_cached_topics(search_term)
35+
if cached_result:
36+
return {
37+
"success": True,
38+
"data": cached_result,
39+
"total": len(cached_result),
40+
"cached": True
41+
}
2742

28-
# Get data from DuckDB
29-
query = "SELECT nameWithOwner, topics FROM repo"
43+
# Get data from normalized tables in DuckDB
44+
query = """
45+
SELECT r.nameWithOwner, t.topic
46+
FROM repos r
47+
JOIN repo_topics t ON r.nameWithOwner = t.repo
48+
"""
3049
df = self.con.execute(query).fetchdf()
31-
32-
# Process topics
33-
def extract_names(item_ls):
34-
if item_ls is not None and len(item_ls) > 0:
35-
return [item["name"] for item in item_ls if "name" in item]
36-
return []
37-
38-
df["topics"] = df["topics"].apply(extract_names)
39-
filtered_df = df[df["topics"].apply(lambda x: search_term in [t.lower() for t in x])]
40-
41-
# Count topics
50+
51+
# Group topics by repo into a list
52+
grouped = df.groupby("nameWithOwner")["topic"].apply(list).reset_index()
53+
grouped.columns = ["nameWithOwner", "topics"]
54+
55+
# Filter repos based on search term in topics
56+
filtered_df = grouped[grouped["topics"].apply(lambda x: search_term in [t.lower() for t in x])]
57+
58+
# Count all co-occurring topics
4259
all_topics = [topic for topics in filtered_df["topics"] for topic in topics]
43-
topic_counts = Counter(all_topics)
60+
topic_counts = Counter([t.lower() for t in all_topics])
61+
62+
# Remove the searched topic itself
4463
topic_counts.pop(search_term, None)
45-
46-
# Format results
64+
65+
# Format results and sort, only including topics with count > 2
4766
topics = [{"name": name, "count": count} for name, count in topic_counts.items() if count > 2]
4867
topics = sorted(topics, key=lambda x: x["count"], reverse=True)
49-
50-
# Cache results
51-
save_cached_topics(search_term, topics)
52-
68+
69+
# Only cache results for allowed terms
70+
if search_term in self.CACHEABLE_TERMS:
71+
save_cached_topics(search_term, topics)
72+
5373
return {
5474
"success": True,
5575
"data": topics,
5676
"total": len(topics),
5777
"cached": False
5878
}
59-
79+
6080
except Exception as e:
6181
return {
6282
"success": False,

backend/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,5 @@ duckdb
44
google-generativeai
55
openai
66
fastapi
7-
uvicorn
7+
uvicorn
8+
kaggle>=1.5.16

backend/test/duckdb_create.ipynb

Lines changed: 165 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,165 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "b637b442-3873-4c23-9159-7a4f00b98e86",
7+
"metadata": {},
8+
"outputs": [
9+
{
10+
"data": {
11+
"application/vnd.jupyter.widget-view+json": {
12+
"model_id": "7a52c853f3d348a490288979f5947d6e",
13+
"version_major": 2,
14+
"version_minor": 0
15+
},
16+
"text/plain": [
17+
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
18+
]
19+
},
20+
"metadata": {},
21+
"output_type": "display_data"
22+
},
23+
{
24+
"data": {
25+
"application/vnd.jupyter.widget-view+json": {
26+
"model_id": "2607eeec41a24eb8afec4c73143d41e9",
27+
"version_major": 2,
28+
"version_minor": 0
29+
},
30+
"text/plain": [
31+
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
32+
]
33+
},
34+
"metadata": {},
35+
"output_type": "display_data"
36+
},
37+
{
38+
"name": "stdout",
39+
"output_type": "stream",
40+
"text": [
41+
"DuckDB database created at repos.duckdb\n"
42+
]
43+
}
44+
],
45+
"source": [
46+
"import duckdb\n",
47+
"import pandas as pd\n",
48+
"import json\n",
49+
"import os\n",
50+
"\n",
51+
"# Path to your JSON file\n",
52+
"json_path = '../../public/data/repo_metadata.json' # Replace with your actual file path\n",
53+
"db_path = 'github_metadata.duckdb' # Output DB file\n",
54+
"\n",
55+
"# Load the JSON data\n",
56+
"with open(json_path, 'r') as f:\n",
57+
" data = json.load(f)\n",
58+
"\n",
59+
"# Ensure data is a list of repos\n",
60+
"if isinstance(data, dict):\n",
61+
" data = [data]\n",
62+
"\n",
63+
"# Flatten top-level fields\n",
64+
"repos = []\n",
65+
"languages = []\n",
66+
"topics = []\n",
67+
"\n",
68+
"for repo in data:\n",
69+
" repo_id = repo.get(\"nameWithOwner\")\n",
70+
"\n",
71+
" # Flatten main repo info\n",
72+
" repos.append({\n",
73+
" \"nameWithOwner\": repo_id,\n",
74+
" \"owner\": repo.get(\"owner\"),\n",
75+
" \"name\": repo.get(\"name\"),\n",
76+
" \"stars\": repo.get(\"stars\"),\n",
77+
" \"forks\": repo.get(\"forks\"),\n",
78+
" \"watchers\": repo.get(\"watchers\"),\n",
79+
" \"isFork\": repo.get(\"isFork\"),\n",
80+
" \"isArchived\": repo.get(\"isArchived\"),\n",
81+
" \"languageCount\": repo.get(\"languageCount\"),\n",
82+
" \"topicCount\": repo.get(\"topicCount\"),\n",
83+
" \"diskUsageKb\": repo.get(\"diskUsageKb\"),\n",
84+
" \"pullRequests\": repo.get(\"pullRequests\"),\n",
85+
" \"issues\": repo.get(\"issues\"),\n",
86+
" \"description\": repo.get(\"description\"),\n",
87+
" \"primaryLanguage\": repo.get(\"primaryLanguage\"),\n",
88+
" \"createdAt\": repo.get(\"createdAt\"),\n",
89+
" \"pushedAt\": repo.get(\"pushedAt\"),\n",
90+
" \"defaultBranchCommitCount\": repo.get(\"defaultBranchCommitCount\"),\n",
91+
" \"license\": repo.get(\"license\"),\n",
92+
" \"assignableUserCount\": repo.get(\"assignableUserCount\"),\n",
93+
" \"codeOfConduct\": repo.get(\"codeOfConduct\"),\n",
94+
" \"forkingAllowed\": repo.get(\"forkingAllowed\"),\n",
95+
" \"parent\": repo.get(\"parent\")\n",
96+
" })\n",
97+
"\n",
98+
" for lang in repo.get(\"languages\", []):\n",
99+
" languages.append({\n",
100+
" \"repo\": repo_id,\n",
101+
" \"language\": lang.get(\"name\"),\n",
102+
" \"size\": lang.get(\"size\")\n",
103+
" })\n",
104+
"\n",
105+
" for topic in repo.get(\"topics\", []):\n",
106+
" topics.append({\n",
107+
" \"repo\": repo_id,\n",
108+
" \"topic\": topic.get(\"name\"),\n",
109+
" \"stars\": topic.get(\"stars\")\n",
110+
" })\n",
111+
"\n",
112+
"# Convert to DataFrames\n",
113+
"df_repos = pd.DataFrame(repos)\n",
114+
"df_languages = pd.DataFrame(languages)\n",
115+
"df_topics = pd.DataFrame(topics)\n",
116+
"\n",
117+
"# Create DuckDB and insert tables\n",
118+
"con = duckdb.connect(db_path)\n",
119+
"con.execute(\"SET threads TO 4;\")\n",
120+
"\n",
121+
"con.register('df_repos', df_repos)\n",
122+
"con.execute(\"CREATE OR REPLACE TABLE repos AS SELECT * FROM df_repos;\")\n",
123+
"\n",
124+
"con.register('df_languages', df_languages)\n",
125+
"con.execute(\"CREATE OR REPLACE TABLE repo_languages AS SELECT * FROM df_languages;\")\n",
126+
"\n",
127+
"con.register('df_topics', df_topics)\n",
128+
"con.execute(\"CREATE OR REPLACE TABLE repo_topics AS SELECT * FROM df_topics;\")\n",
129+
"\n",
130+
"con.close()\n",
131+
"\n",
132+
"print(f\"DuckDB database created at {db_path}\")\n"
133+
]
134+
},
135+
{
136+
"cell_type": "code",
137+
"execution_count": null,
138+
"id": "2442785a-d821-47ab-bf16-86ceab3af02b",
139+
"metadata": {},
140+
"outputs": [],
141+
"source": []
142+
}
143+
],
144+
"metadata": {
145+
"kernelspec": {
146+
"display_name": "Python 3 (ipykernel)",
147+
"language": "python",
148+
"name": "python3"
149+
},
150+
"language_info": {
151+
"codemirror_mode": {
152+
"name": "ipython",
153+
"version": 3
154+
},
155+
"file_extension": ".py",
156+
"mimetype": "text/x-python",
157+
"name": "python",
158+
"nbconvert_exporter": "python",
159+
"pygments_lexer": "ipython3",
160+
"version": "3.10.13"
161+
}
162+
},
163+
"nbformat": 4,
164+
"nbformat_minor": 5
165+
}

0 commit comments

Comments
 (0)