data-exp-lab
diff --git a/.gitignore b/.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/backend/app/services/topic_service.py b/backend/app/services/topic_service.py
Lines changed: 56 additions & 36 deletions
diff --git a/backend/requirements.txt b/backend/requirements.txt
Lines changed: 2 additions & 1 deletion
diff --git a/backend/test/duckdb_create.ipynb b/backend/test/duckdb_create.ipynb
Lines changed: 165 additions & 0 deletions
@@ -10,4 +10,5 @@ dist
 .DS_Store
 *temp
 *repo_metadata.json
-__pycache__
+__pycache__
+*.duckdb
@@ -1,62 +1,82 @@
 from collections import Counter
 import duckdb
 from app.utils.cache import get_cached_topics, save_cached_topics
+import os
 
 class TopicService:
+    # Define the allowed terms for caching
+    CACHEABLE_TERMS = {
+        "visual-programming",
+        "machine-learning",
+        "logic-programming",
+        "large-language-models"
+    }
+
     def __init__(self):
-        self.con = duckdb.connect(database=':memory:')
-        self.con.execute("SET threads TO 16;")
-        self.con.execute("""
-            CREATE TEMP TABLE repo AS 
-            SELECT * FROM read_json_auto('../public/data/repo_metadata.json');
-        """)
+        """Open the prebuilt DuckDB database file in read-only mode.
+
+        Raises:
+            FileNotFoundError: If the database file does not exist at the
+                expected relative path.
+        """
+        db_path = '../public/data/github_meta.duckdb'
+
+        # Fail fast when the database file is missing
+        if not os.path.exists(db_path):
+            raise FileNotFoundError(
+                f"Database not found at {db_path}. Please ensure the database file exists before running the application."
+            )
+
+        # Read-only connection avoids locking issues
+        self.con = duckdb.connect(database=db_path, read_only=True)
+        self.con.execute("SET threads TO 8;")
 
     def process_topics(self, search_term: str):
         try:
             search_term = search_term.lower()
 
-            # Check cache
-            cached_result = get_cached_topics(search_term)
-            if cached_result:
-                return {
-                    "success": True,
-                    "data": cached_result,
-                    "total": len(cached_result),
-                    "cached": True
-                }
+            # Only check cache for allowed terms
+            if search_term in self.CACHEABLE_TERMS:
+                cached_result = get_cached_topics(search_term)
+                if cached_result:
+                    return {
+                        "success": True,
+                        "data": cached_result,
+                        "total": len(cached_result),
+                        "cached": True
+                    }
 
-            # Get data from DuckDB
-            query = "SELECT nameWithOwner, topics FROM repo"
+            # Get data from normalized tables in DuckDB
+            query = """
+                SELECT r.nameWithOwner, t.topic
+                FROM repos r
+                JOIN repo_topics t ON r.nameWithOwner = t.repo
+            """
             df = self.con.execute(query).fetchdf()
-            
-            # Process topics
-            def extract_names(item_ls):
-                if item_ls is not None and len(item_ls) > 0:
-                    return [item["name"] for item in item_ls if "name" in item]
-                return []
-            
-            df["topics"] = df["topics"].apply(extract_names)
-            filtered_df = df[df["topics"].apply(lambda x: search_term in [t.lower() for t in x])]
-            
-            # Count topics
+
+            # Group topics by repo into a list
+            grouped = df.groupby("nameWithOwner")["topic"].apply(list).reset_index()
+            grouped.columns = ["nameWithOwner", "topics"]
+
+            # Filter repos based on search term in topics
+            filtered_df = grouped[grouped["topics"].apply(lambda x: search_term in [t.lower() for t in x])]
+
+            # Count all co-occurring topics
             all_topics = [topic for topics in filtered_df["topics"] for topic in topics]
-            topic_counts = Counter(all_topics)
+            topic_counts = Counter([t.lower() for t in all_topics])
+
+            # Remove the searched topic itself
             topic_counts.pop(search_term, None)
-            
-            # Format results
+
+            # Format results and sort, only including topics with count > 2
             topics = [{"name": name, "count": count} for name, count in topic_counts.items() if count > 2]
             topics = sorted(topics, key=lambda x: x["count"], reverse=True)
-            
-            # Cache results
-            save_cached_topics(search_term, topics)
-            
+
+            # Only cache results for allowed terms
+            if search_term in self.CACHEABLE_TERMS:
+                save_cached_topics(search_term, topics)
+
             return {
                 "success": True,
                 "data": topics,
                 "total": len(topics),
                 "cached": False
             }
-        
+
         except Exception as e:
             return {
                 "success": False,
 
@@ -4,4 +4,5 @@ duckdb
 google-generativeai
 openai
 fastapi
-uvicorn 
+uvicorn
+kaggle>=1.5.16 
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b637b442-3873-4c23-9159-7a4f00b98e86",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7a52c853f3d348a490288979f5947d6e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2607eeec41a24eb8afec4c73143d41e9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DuckDB database created at repos.duckdb\n"
+     ]
+    }
+   ],
+   "source": [
+    "import duckdb\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import os\n",
+    "\n",
+    "# Path to your JSON file\n",
+    "json_path = '../../public/data/repo_metadata.json'  # Replace with your actual file path\n",
+    "db_path = 'github_metadata.duckdb'      # Output DB file\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open(json_path, 'r') as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "# Ensure data is a list of repos\n",
+    "if isinstance(data, dict):\n",
+    "    data = [data]\n",
+    "\n",
+    "# Accumulators for the three normalized tables (repos, languages, topics)\n",
+    "repos = []\n",
+    "languages = []\n",
+    "topics = []\n",
+    "\n",
+    "for repo in data:\n",
+    "    repo_id = repo.get(\"nameWithOwner\")\n",
+    "\n",
+    "    # Flatten main repo info\n",
+    "    repos.append({\n",
+    "        \"nameWithOwner\": repo_id,\n",
+    "        \"owner\": repo.get(\"owner\"),\n",
+    "        \"name\": repo.get(\"name\"),\n",
+    "        \"stars\": repo.get(\"stars\"),\n",
+    "        \"forks\": repo.get(\"forks\"),\n",
+    "        \"watchers\": repo.get(\"watchers\"),\n",
+    "        \"isFork\": repo.get(\"isFork\"),\n",
+    "        \"isArchived\": repo.get(\"isArchived\"),\n",
+    "        \"languageCount\": repo.get(\"languageCount\"),\n",
+    "        \"topicCount\": repo.get(\"topicCount\"),\n",
+    "        \"diskUsageKb\": repo.get(\"diskUsageKb\"),\n",
+    "        \"pullRequests\": repo.get(\"pullRequests\"),\n",
+    "        \"issues\": repo.get(\"issues\"),\n",
+    "        \"description\": repo.get(\"description\"),\n",
+    "        \"primaryLanguage\": repo.get(\"primaryLanguage\"),\n",
+    "        \"createdAt\": repo.get(\"createdAt\"),\n",
+    "        \"pushedAt\": repo.get(\"pushedAt\"),\n",
+    "        \"defaultBranchCommitCount\": repo.get(\"defaultBranchCommitCount\"),\n",
+    "        \"license\": repo.get(\"license\"),\n",
+    "        \"assignableUserCount\": repo.get(\"assignableUserCount\"),\n",
+    "        \"codeOfConduct\": repo.get(\"codeOfConduct\"),\n",
+    "        \"forkingAllowed\": repo.get(\"forkingAllowed\"),\n",
+    "        \"parent\": repo.get(\"parent\")\n",
+    "    })\n",
+    "\n",
+    "    for lang in repo.get(\"languages\", []):\n",
+    "        languages.append({\n",
+    "            \"repo\": repo_id,\n",
+    "            \"language\": lang.get(\"name\"),\n",
+    "            \"size\": lang.get(\"size\")\n",
+    "        })\n",
+    "\n",
+    "    for topic in repo.get(\"topics\", []):\n",
+    "        topics.append({\n",
+    "            \"repo\": repo_id,\n",
+    "            \"topic\": topic.get(\"name\"),\n",
+    "            \"stars\": topic.get(\"stars\")\n",
+    "        })\n",
+    "\n",
+    "# Convert to DataFrames\n",
+    "df_repos = pd.DataFrame(repos)\n",
+    "df_languages = pd.DataFrame(languages)\n",
+    "df_topics = pd.DataFrame(topics)\n",
+    "\n",
+    "# Create DuckDB and insert tables\n",
+    "con = duckdb.connect(db_path)\n",
+    "con.execute(\"SET threads TO 4;\")\n",
+    "\n",
+    "con.register('df_repos', df_repos)\n",
+    "con.execute(\"CREATE OR REPLACE TABLE repos AS SELECT * FROM df_repos;\")\n",
+    "\n",
+    "con.register('df_languages', df_languages)\n",
+    "con.execute(\"CREATE OR REPLACE TABLE repo_languages AS SELECT * FROM df_languages;\")\n",
+    "\n",
+    "con.register('df_topics', df_topics)\n",
+    "con.execute(\"CREATE OR REPLACE TABLE repo_topics AS SELECT * FROM df_topics;\")\n",
+    "\n",
+    "con.close()\n",
+    "\n",
+    "print(f\"DuckDB database created at {db_path}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2442785a-d821-47ab-bf16-86ceab3af02b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}