Skip to content

Commit e4763ac

Browse files
committed
add cache
1 parent 15b3e76 commit e4763ac

7 files changed

Lines changed: 503 additions & 29 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ node_modules
55
.env.development.local
66
.env.test.local
77
.env.production.local
8-
*.ipython_checkpoints
8+
.ipynb_checkpoints/
99
dist
1010
.DS_Store
1111
*temp

backend/duckdb_test.ipynb

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 14,
6+
"id": "9cef20d5-c51f-4597-afd9-456c7c26f5f4",
7+
"metadata": {},
8+
"outputs": [
9+
{
10+
"data": {
11+
"application/vnd.jupyter.widget-view+json": {
12+
"model_id": "13d1ff9f76744ae397b1c5ca7e8c6498",
13+
"version_major": 2,
14+
"version_minor": 0
15+
},
16+
"text/plain": [
17+
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
18+
]
19+
},
20+
"metadata": {},
21+
"output_type": "display_data"
22+
},
23+
{
24+
"data": {
25+
"application/vnd.jupyter.widget-view+json": {
26+
"model_id": "a4efdf3931044bdbb61a128cd1496dd5",
27+
"version_major": 2,
28+
"version_minor": 0
29+
},
30+
"text/plain": [
31+
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
32+
]
33+
},
34+
"metadata": {},
35+
"output_type": "display_data"
36+
}
37+
],
38+
"source": [
39+
"import duckdb\n",
40+
"\n",
41+
"search_term = \"logic-programming\"\n",
42+
"\n",
43+
"# Step 1: Load JSON into a DuckDB temp table with parallel processing enabled\n",
44+
"con = duckdb.connect(database=':memory:')\n",
45+
"con.execute(\"SET threads TO 10;\") # Adjust number based on your CPU cores\n",
46+
"con.execute(\"\"\"\n",
47+
" CREATE TEMP TABLE repo AS \n",
48+
" SELECT * FROM read_json_auto('../public/data/repo_metadata.json');\n",
49+
"\"\"\")\n",
50+
"\n",
51+
"# Step 2: Get nameWithOwner and topics into a pandas DataFrame\n",
52+
"query = \"SELECT nameWithOwner, topics FROM repo\"\n",
53+
"df = con.execute(query).fetchdf()\n",
54+
"\n",
55+
"# Step 3: Normalize topics into list of names\n",
56+
"def extract_names(item_ls):\n",
57+
" if item_ls is not None and len(item_ls) > 0:\n",
58+
" return [item[\"name\"] for item in item_ls if \"name\" in item]\n",
59+
" return []\n",
60+
"\n",
61+
"df[\"topics\"] = df[\"topics\"].apply(extract_names)\n",
62+
"\n",
63+
"# Step 4: Filter repos based on search term in topics\n",
64+
"filtered_df = df[df[\"topics\"].apply(lambda x: search_term in [t.lower() for t in x])]\n",
65+
"\n",
66+
"# Step 5: Count all co-occurring topics\n",
67+
"all_topics = [topic for topics in filtered_df[\"topics\"] for topic in topics]\n",
68+
"topic_counts = Counter(all_topics)\n",
69+
"\n",
70+
"# Remove the searched topic itself\n",
71+
"topic_counts.pop(search_term, None)\n",
72+
"\n",
73+
"# Step 6: Convert to list of dicts and sort, only including topics with count > 1\n",
74+
"topics = [{\"name\": name, \"count\": count} for name, count in topic_counts.items() if count > 2]\n",
75+
"topics = sorted(topics, key=lambda x: x[\"count\"], reverse=True)"
76+
]
77+
}
78+
],
79+
"metadata": {
80+
"kernelspec": {
81+
"display_name": "Python 3 (ipykernel)",
82+
"language": "python",
83+
"name": "python3"
84+
},
85+
"language_info": {
86+
"codemirror_mode": {
87+
"name": "ipython",
88+
"version": 3
89+
},
90+
"file_extension": ".py",
91+
"mimetype": "text/x-python",
92+
"name": "python",
93+
"nbconvert_exporter": "python",
94+
"pygments_lexer": "ipython3",
95+
"version": "3.10.13"
96+
}
97+
},
98+
"nbformat": 4,
99+
"nbformat_minor": 5
100+
}

backend/topic_processor.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from collections import Counter
66
from concurrent.futures import ThreadPoolExecutor
77
from functools import partial
8+
import os
9+
from pathlib import Path
810

911
app = Flask(__name__)
1012
# Configure CORS to allow all origins and methods
@@ -16,6 +18,21 @@
1618
}
1719
})
1820

21+
# Add this function to handle cache operations
22+
def get_cached_topics(search_term):
23+
cache_file = Path('../public/data/cached_topics') / f"{search_term}.json"
24+
if cache_file.exists():
25+
with open(cache_file, 'r') as f:
26+
return json.load(f)
27+
return None
def save_cached_topics(search_term, topics_data):
    """Persist *topics_data* as JSON so later requests for *search_term*
    can skip the DuckDB recomputation.

    Silently does nothing for unsafe terms; mirrors the guard in
    get_cached_topics so the two sides agree on what is cacheable.
    """
    # Same untrusted-input guard as the read side. This is the more
    # dangerous direction (arbitrary file WRITE), so skip rather than write
    # outside the cache directory.
    if not search_term or '/' in search_term or '\\' in search_term or '..' in search_term:
        return
    cache_dir = Path('../public/data/cached_topics')
    # parents=True: the original mkdir(exist_ok=True) raises
    # FileNotFoundError when ../public/data does not exist yet (e.g. on a
    # fresh checkout before the frontend data dir is created).
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_file = cache_dir / f"{search_term}.json"
    with open(cache_file, 'w') as f:
        json.dump(topics_data, f)
1936
@app.route('/process-topics', methods=['GET', 'POST'])
2037
def process_topics():
2138
try:
@@ -25,6 +42,17 @@ def process_topics():
2542
else: # GET request
2643
search_term = request.args.get('searchTerm', '').lower()
2744

45+
# Check if we have cached results
46+
cached_result = get_cached_topics(search_term)
47+
if cached_result:
48+
return jsonify({
49+
"success": True,
50+
"data": cached_result,
51+
"total": len(cached_result),
52+
"cached": True
53+
})
54+
55+
# If not cached, proceed with the original processing
2856
# Step 1: Load JSON into a DuckDB temp table with parallel processing enabled
2957
con = duckdb.connect(database=':memory:')
3058
con.execute("SET threads TO 16;") # Adjust number based on your CPU cores
@@ -59,10 +87,14 @@ def extract_names(item_ls):
5987
topics = [{"name": name, "count": count} for name, count in topic_counts.items() if count > 2]
6088
topics = sorted(topics, key=lambda x: x["count"], reverse=True)
6189

90+
# Before returning, cache the results
91+
save_cached_topics(search_term, topics)
92+
6293
return jsonify({
6394
"success": True,
6495
"data": topics,
65-
"total": len(topics)
96+
"total": len(topics),
97+
"cached": False
6698
})
6799

68100
except Exception as e:

0 commit comments

Comments
 (0)