-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathllama_chunker.py
More file actions
111 lines (91 loc) · 3.29 KB
/
llama_chunker.py
File metadata and controls
111 lines (91 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
LlamaIndex-based chunking for code and text.
Replaces smart_chunker.py with llama-index's built-in splitters.
"""
from typing import List
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
from llama_index.core.schema import Document
from utils.logger import get_logger
logger = get_logger(__name__)
def chunk_with_llama_index(
    content: str,
    language: str = "text",
    chunk_size: int = 800,
    chunk_overlap: int = 100
) -> List[str]:
    """
    Chunk text or code using llama-index's splitters.

    Uses CodeSplitter for recognized programming languages (so chunks
    respect syntactic boundaries) and SentenceSplitter for plain text or
    unknown languages. Any splitter failure degrades to character-based
    chunking via simple_chunk() rather than raising.

    Args:
        content: Text or code content to chunk
        language: Programming language (python, javascript, etc.) or "text"
        chunk_size: Target size for each chunk in characters
        chunk_overlap: Overlap between chunks in characters. NOTE: only
            honored on the SentenceSplitter path; the CodeSplitter path
            uses a fixed line-based overlap.

    Returns:
        List of text chunks; empty list for empty input.
    """
    # Empty input yields no chunks. Without this guard the splitter
    # produces zero nodes and the fallback below would return a single
    # empty-string chunk, inconsistent with simple_chunk("").
    if not content:
        return []

    # Map caller-facing language names to llama-index language identifiers.
    language_map = {
        "python": "python",
        "javascript": "js",
        "typescript": "ts",
        "java": "java",
        "go": "go",
        "rust": "rust",
        "c": "c",
        "cpp": "cpp",
        "c++": "cpp",
    }
    try:
        # Check if it's a supported code language (case-insensitive).
        llama_lang = language_map.get(language.lower())
        if llama_lang:
            # Use CodeSplitter for code.
            splitter = CodeSplitter(
                language=llama_lang,
                chunk_lines=40,          # Target lines per chunk (approximation)
                chunk_lines_overlap=5,   # Overlap in lines
                max_chars=chunk_size
            )
            logger.debug(f"Using CodeSplitter for language: {llama_lang}")
        else:
            # Use SentenceSplitter for text or unknown languages.
            splitter = SentenceSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                paragraph_separator="\n\n",
                secondary_chunking_regex="[^,.;。?!]+[,.;。?!]?"
            )
            logger.debug(f"Using SentenceSplitter for language: {language}")

        # Wrap the raw string in a Document and split it into nodes.
        doc = Document(text=content)
        nodes = splitter.get_nodes_from_documents([doc])

        # Extract text from nodes, dropping any empty ones; guarantee at
        # least one chunk for non-empty input.
        chunks = [node.text for node in nodes if node.text]
        logger.debug(f"Split content into {len(chunks)} chunks")
        return chunks if chunks else [content]
    except Exception as e:
        # Splitter errors (e.g. a missing tree-sitter parser for the
        # requested language) must not fail the caller — fall back to
        # simple character-based chunking.
        logger.exception(f"Error chunking with llama-index: {e}")
        return simple_chunk(content, chunk_size, chunk_overlap)
def simple_chunk(text: str, chunk_size: int = 800, chunk_overlap: int = 100) -> List[str]:
    """
    Fallback chunker: fixed-size character windows with overlap.

    Args:
        text: Text to chunk
        chunk_size: Size of each chunk
        chunk_overlap: Overlap between consecutive chunks

    Returns:
        List of text chunks ([] for empty input). Whitespace-only windows
        are skipped; if every window is whitespace, the original text is
        returned as a single chunk.
    """
    if not text:
        return []

    pieces: List[str] = []
    length = len(text)
    # Advance by at least one character per step, even when the requested
    # overlap meets or exceeds the chunk size.
    stride = max(1, chunk_size - chunk_overlap)

    start = 0
    while start < length:
        stop = min(start + chunk_size, length)
        window = text[start:stop]
        if window.strip():
            pieces.append(window)
        if stop >= length:
            # Reached the end of the text — stop before producing a
            # redundant trailing window.
            break
        start += stride

    return pieces if pieces else [text]