data-exp-lab
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 1 deletion b/‎.gitignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backend/duckdb_test.ipynb‎
Lines changed: 100 additions & 0 deletions b/‎backend/duckdb_test.ipynb‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎backend/topic_processor.py‎
Lines changed: 33 additions & 1 deletion b/‎backend/topic_processor.py‎
Lines changed: 33 additions & 1 deletion
@@ -5,7 +5,7 @@ node_modules
 .env.development.local
 .env.test.local
 .env.production.local
-*.ipython_checkpoints
+.ipynb_checkpoints/
 dist
 .DS_Store
 *temp
 
@@ -0,0 +1,100 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "9cef20d5-c51f-4597-afd9-456c7c26f5f4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "13d1ff9f76744ae397b1c5ca7e8c6498",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a4efdf3931044bdbb61a128cd1496dd5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import duckdb\n",
+    "\n",
+    "search_term = \"logic-programming\"\n",
+    "\n",
+    "# Step 1: Load JSON into a DuckDB temp table with parallel processing enabled\n",
+    "con = duckdb.connect(database=':memory:')\n",
+    "con.execute(\"SET threads TO 10;\")  # Adjust number based on your CPU cores\n",
+    "con.execute(\"\"\"\n",
+    "    CREATE TEMP TABLE repo AS \n",
+    "    SELECT * FROM read_json_auto('../public/data/repo_metadata.json');\n",
+    "\"\"\")\n",
+    "\n",
+    "# Step 2: Get nameWithOwner and topics into a pandas DataFrame\n",
+    "query = \"SELECT nameWithOwner, topics FROM repo\"\n",
+    "df = con.execute(query).fetchdf()\n",
+    "\n",
+    "# Step 3: Normalize topics into list of names\n",
+    "def extract_names(item_ls):\n",
+    "    if item_ls is not None and len(item_ls) > 0:\n",
+    "        return [item[\"name\"] for item in item_ls if \"name\" in item]\n",
+    "    return []\n",
+    "\n",
+    "df[\"topics\"] = df[\"topics\"].apply(extract_names)\n",
+    "\n",
+    "# Step 4: Filter repos based on search term in topics\n",
+    "filtered_df = df[df[\"topics\"].apply(lambda x: search_term in [t.lower() for t in x])]\n",
+    "\n",
+    "# Step 5: Count all co-occurring topics\n",
+    "all_topics = [topic for topics in filtered_df[\"topics\"] for topic in topics]\n",
+    "topic_counts = Counter(all_topics)\n",
+    "\n",
+    "# Remove the searched topic itself\n",
+    "topic_counts.pop(search_term, None)\n",
+    "\n",
+    "# Step 6: Convert to list of dicts and sort, only including topics with count > 2\n",
+    "topics = [{\"name\": name, \"count\": count} for name, count in topic_counts.items() if count > 2]\n",
+    "topics = sorted(topics, key=lambda x: x[\"count\"], reverse=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -5,6 +5,8 @@
 from collections import Counter
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
+import os
+from pathlib import Path
 
 app = Flask(__name__)
 # Configure CORS to allow all origins and methods
@@ -16,6 +18,21 @@
     }
 })
 
+# Disk cache helpers: topic results are stored as one JSON file per search term
+def get_cached_topics(search_term):
+    """Return the cached topic list for ``search_term``, or ``None`` on a miss.
+
+    The cache holds one JSON file per search term under
+    ``../public/data/cached_topics`` (resolved against the working directory).
+
+    Args:
+        search_term: Search term used as the base name of the cache file.
+
+    Returns:
+        The decoded JSON content of the cache file, or ``None`` when the
+        term is empty or unsafe as a file name, the file does not exist,
+        or the file cannot be read or parsed.
+    """
+    # The term is built into a file name, so refuse anything that could
+    # point outside the cache directory (path separators) or make open()
+    # raise (NUL byte).
+    if not search_term or any(ch in search_term for ch in '/\\\0'):
+        return None
+    cache_file = Path('../public/data/cached_topics') / f"{search_term}.json"
+    try:
+        with open(cache_file, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except (OSError, ValueError):
+        # Missing, unreadable or corrupt entry (JSONDecodeError is a
+        # ValueError): report a miss so the caller recomputes the result.
+        return None
+
+def save_cached_topics(search_term, topics_data):
+    """Write ``topics_data`` to the on-disk cache entry for ``search_term``.
+
+    The entry is stored as ``<search_term>.json`` under
+    ``../public/data/cached_topics`` (resolved against the working
+    directory); the directory and any missing parents are created.
+
+    Args:
+        search_term: Search term used as the base name of the cache file.
+        topics_data: JSON-serializable data to store.
+
+    Returns:
+        None. Terms that are empty or unsafe as a file name are not
+        cached; the call is then a no-op instead of an error.
+    """
+    # The term is built into a file name, so skip caching for anything that
+    # could write outside the cache directory (path separators) or make
+    # open() raise (NUL byte).
+    if not search_term or any(ch in search_term for ch in '/\\\0'):
+        return
+    cache_dir = Path('../public/data/cached_topics')
+    # parents=True: the first write must not fail when the intermediate
+    # directories do not exist yet.
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    cache_file = cache_dir / f"{search_term}.json"
+    with open(cache_file, 'w', encoding='utf-8') as f:
+        json.dump(topics_data, f)
+
 @app.route('/process-topics', methods=['GET', 'POST'])
 def process_topics():
     try:
@@ -25,6 +42,17 @@ def process_topics():
         else:  # GET request
             search_term = request.args.get('searchTerm', '').lower()
 
+        # Check if we have cached results
+        cached_result = get_cached_topics(search_term)
+        if cached_result:
+            return jsonify({
+                "success": True,
+                "data": cached_result,
+                "total": len(cached_result),
+                "cached": True
+            })
+
+        # If not cached, proceed with the original processing
         # Step 1: Load JSON into a DuckDB temp table with parallel processing enabled
         con = duckdb.connect(database=':memory:')
         con.execute("SET threads TO 16;")  # Adjust number based on your CPU cores
@@ -59,10 +87,14 @@ def extract_names(item_ls):
         topics = [{"name": name, "count": count} for name, count in topic_counts.items() if count > 2]
         topics = sorted(topics, key=lambda x: x["count"], reverse=True)
 
+        # Before returning, cache the results
+        save_cached_topics(search_term, topics)
+        
         return jsonify({
             "success": True,
             "data": topics,
-            "total": len(topics)
+            "total": len(topics),
+            "cached": False
         })
 
     except Exception as e: