feat: review comments

akihikokuroda · akihikokuroda · commit 3fc5fbd30773 · 2026-04-02T13:00:04.000-04:00
Signed-off-by: Akihiko Kuroda <akihikokuroda2020@gmail.com>
diff --git a/mellea/stdlib/requirements/__init__.py b/mellea/stdlib/requirements/__init__.py
@@ -4,7 +4,7 @@
 from ...core import Requirement, ValidationResult, default_output_to_bool
 from .md import as_markdown_list, is_markdown_list, is_markdown_table
 from .python_reqs import PythonExecutionReq
-from .rag import CitationRequirement
+from .rag import CitationMode, CitationRequirement
 from .requirement import (
     ALoraRequirement,
     LLMaJRequirement,
@@ -18,6 +18,7 @@
 
 __all__ = [
     "ALoraRequirement",
+    "CitationMode",
     "CitationRequirement",
     "LLMaJRequirement",
     "PythonExecutionReq",
diff --git a/mellea/stdlib/requirements/rag.py b/mellea/stdlib/requirements/rag.py
@@ -1,12 +1,27 @@
 """Requirements for RAG (Retrieval-Augmented Generation) workflows."""
 
 from collections.abc import Iterable
+from enum import Enum
 
 from ...backends.adapters import AdapterMixin
 from ...core import Backend, Context, Requirement, ValidationResult
 from ..components import Document, Message
 
 
+class CitationMode(Enum):
+    """Mode for calculating citation coverage.
+
+    Attributes:
+        CLAIMS: Ratio of citation records to the estimated number of claims.
+            One citation record = one cited claim; total claims = sentence count.
+        CHARACTERS: Ratio of cited characters to total characters.
+            Sums each citation's character range (overlaps are counted twice).
+    """
+
+    CLAIMS = "claims"
+    CHARACTERS = "characters"
+
+
 class CitationRequirement(Requirement):
     """Requirement that validates RAG responses have adequate citation coverage.
 
@@ -30,14 +45,19 @@ class CitationRequirement(Requirement):
 
     Args:
         min_citation_coverage: Minimum ratio of cited content (0.0-1.0).
-            The ratio of characters with citations to total response length
-            must meet or exceed this threshold. Default is 0.8 (80% coverage).
+            Interpretation depends on mode:
+            - CLAIMS mode: fraction of factual claims with citations
+            - CHARACTERS mode: ratio of cited characters to total characters
+            Default is 0.8 (80% coverage).
         documents: Optional documents to validate against. Can be Document
             objects or strings (will be converted to Documents). If provided,
             these documents will be used instead of documents attached to
             messages in the context. Default is None (use context documents).
+        mode: Citation coverage calculation mode. Default is CitationMode.CLAIMS
+            (count fraction of claims with citations). Use CitationMode.CHARACTERS
+            to calculate character-based coverage ratio instead.
         description: Custom description for the requirement. If None,
-            generates a description based on coverage threshold.
+            generates a description based on coverage threshold and mode.
 
     Example:
         ```python
@@ -64,6 +84,7 @@ def __init__(
         self,
         min_citation_coverage: float = 0.8,
         documents: Iterable[Document] | Iterable[str] | None = None,
+        mode: CitationMode = CitationMode.CLAIMS,
         description: str | None = None,
     ):
         """Initialize citation coverage requirement."""
@@ -73,6 +94,7 @@ def __init__(
             )
 
         self.min_citation_coverage = min_citation_coverage
+        self.mode = mode
 
         # Convert documents to Document objects if provided
         if documents is not None:
@@ -87,10 +109,16 @@ def __init__(
 
         # Generate description if not provided
         if description is None:
-            description = (
-                f"Response must have adequate citation coverage "
-                f"(minimum {min_citation_coverage * 100:.0f}% of content cited)"
-            )
+            if mode == CitationMode.CLAIMS:
+                description = (
+                    f"Response must have adequate citation coverage "
+                    f"(minimum {min_citation_coverage * 100:.0f}% of factual claims cited)"
+                )
+            else:  # CitationMode.CHARACTERS
+                description = (
+                    f"Response must have adequate citation coverage "
+                    f"(minimum {min_citation_coverage * 100:.0f}% of characters cited)"
+                )
 
         # Initialize parent without validation function - we override validate() instead
         super().__init__(description=description, validation_fn=None)
@@ -198,13 +226,34 @@ async def validate(
                 False, reason=f"Citation detection intrinsic failed: {e!s}"
             )
 
-        # Calculate citation coverage
-
-        cited_chars = sum(
-            citation["response_end"] - citation["response_begin"]
-            for citation in citations
-        )
-        coverage_ratio = cited_chars / total_chars
+        # Calculate citation coverage based on mode
+        if self.mode == CitationMode.CLAIMS:
+            # Claim coverage = citation records / estimated total claims.
+            # Each citation record counts as one cited claim, so several records for
+            # one sentence can push the ratio above 1.0. Total claims are estimated as
+            # the number of sentences, split on runs of ".", "!" or "?".
+            import re
+
+            # Split response into sentences (simple heuristic)
+            sentences = re.split(r"[.!?]+", response)
+            # Filter out empty strings and whitespace-only strings
+            sentences = [s.strip() for s in sentences if s.strip()]
+            total_claims = len(sentences)
+
+            if total_claims == 0:
+                # Edge case: no sentences detected
+                coverage_ratio = 1.0 if len(citations) == 0 else 0.0
+            else:
+                # Number of claims with citations = number of citation records
+                cited_claims = len(citations)
+                coverage_ratio = cited_claims / total_claims
+        else:  # CitationMode.CHARACTERS
+            # Calculate character-based coverage
+            cited_chars = sum(
+                citation["response_end"] - citation["response_begin"]
+                for citation in citations
+            )
+            coverage_ratio = cited_chars / total_chars
 
         # Check against min_citation_coverage
         passed = coverage_ratio >= self.min_citation_coverage
@@ -231,15 +280,20 @@ def _build_reason(
         coverage_pct = coverage_ratio * 100
         threshold_pct = self.min_citation_coverage * 100
 
+        if self.mode == CitationMode.CLAIMS:
+            metric_name = "claims"
+        else:
+            metric_name = "characters"
+
         if passed:
             reason = (
                 f"Response has adequate citation coverage "
-                f"({coverage_pct:.1f}% cited, threshold: {threshold_pct:.1f}%)"
+                f"({coverage_pct:.1f}% of {metric_name} cited, threshold: {threshold_pct:.1f}%)"
             )
         else:
             reason = (
                 f"Response has insufficient citation coverage "
-                f"({coverage_pct:.1f}% cited, threshold: {threshold_pct:.1f}%)"
+                f"({coverage_pct:.1f}% of {metric_name} cited, threshold: {threshold_pct:.1f}%)"
             )
 
         # Add details about citations
diff --git a/test/stdlib/requirements/test_rag_requirements.py b/test/stdlib/requirements/test_rag_requirements.py
@@ -6,7 +6,7 @@
 from mellea.backends.huggingface import LocalHFBackend
 from mellea.stdlib.components import Document, Message
 from mellea.stdlib.context import ChatContext
-from mellea.stdlib.requirements.rag import CitationRequirement
+from mellea.stdlib.requirements.rag import CitationMode, CitationRequirement
 
 
 @pytest.mark.huggingface
@@ -269,18 +269,137 @@ async def test_citation_requirement_threshold_boundary():
         "mellea.stdlib.components.intrinsic.rag.find_citations",
         return_value=mock_citations,
     ):
-        # Test at exact threshold (0.8)
-        req = CitationRequirement(min_citation_coverage=0.8)
+        # Test at exact threshold (0.8) with CHARACTERS mode
+        req = CitationRequirement(
+            min_citation_coverage=0.8, mode=CitationMode.CHARACTERS
+        )
         result = await req.validate(backend, ctx)
 
         # At exact threshold, should pass (>= comparison)
         assert result.as_bool()
         assert result.score == 0.8
 
-        # Test just below threshold (0.81)
-        req_above = CitationRequirement(min_citation_coverage=0.81)
+        # Test with a threshold (0.81) just above the 0.8 coverage, CHARACTERS mode
+        req_above = CitationRequirement(
+            min_citation_coverage=0.81, mode=CitationMode.CHARACTERS
+        )
         result_above = await req_above.validate(backend, ctx)
 
         # Just below threshold, should fail
         assert not result_above.as_bool()
         assert result_above.score == 0.8
+
+
+async def test_citation_requirement_claims_mode():
+    """Test citation requirement with CLAIMS mode."""
+    from unittest.mock import Mock, patch
+
+    backend = Mock(spec=LocalHFBackend)
+
+    # Create documents
+    docs = [
+        Document(doc_id="doc1", text="The sky is blue."),
+        Document(doc_id="doc2", text="Grass is green."),
+    ]
+
+    # Create a response with 3 sentences
+    response = "The sky is blue. Grass is green. Water is wet."
+
+    # Create context
+    ctx = ChatContext().add(Message("user", "Tell me some facts."))
+    ctx = ctx.add(Message("assistant", response, documents=docs))
+
+    # Mock find_citations to return 2 citations (2 out of 3 claims = 66.7%)
+    mock_citations = [
+        {
+            "response_begin": 0,
+            "response_end": 16,
+            "response_text": "The sky is blue",
+            "citation_doc_id": "doc1",
+            "citation_text": "The sky is blue.",
+        },
+        {
+            "response_begin": 18,
+            "response_end": 34,
+            "response_text": "Grass is green",
+            "citation_doc_id": "doc2",
+            "citation_text": "Grass is green.",
+        },
+    ]
+
+    with patch(
+        "mellea.stdlib.components.intrinsic.rag.find_citations",
+        return_value=mock_citations,
+    ):
+        # Test with CLAIMS mode (default) - 2 citations out of 3 sentences = 66.7%
+        req = CitationRequirement(min_citation_coverage=0.6)
+        result = await req.validate(backend, ctx)
+
+        # Should pass (66.7% >= 60%)
+        assert result.as_bool()
+        assert result.score is not None
+        assert abs(result.score - 0.667) < 0.01  # tolerance: 0.667 approximates 2/3
+        assert result.reason is not None
+        assert "claims" in result.reason
+
+        # Test with higher threshold that should fail
+        req_high = CitationRequirement(min_citation_coverage=0.7)
+        result_high = await req_high.validate(backend, ctx)
+
+        # Should fail (66.7% < 70%)
+        assert not result_high.as_bool()
+        assert result_high.score is not None
+        assert abs(result_high.score - 0.667) < 0.01
+
+
+async def test_citation_requirement_characters_vs_claims():
+    """Test that CHARACTERS and CLAIMS modes produce different results."""
+    from unittest.mock import Mock, patch
+
+    backend = Mock(spec=LocalHFBackend)
+
+    # Create documents
+    docs = [Document(doc_id="doc1", text="Short fact.")]
+
+    # Create a response: 1 short sentence with citation, 1 long sentence without
+    response = "Short. This is a much longer sentence without any citation support."
+
+    # Create context
+    ctx = ChatContext().add(Message("user", "Tell me something."))
+    ctx = ctx.add(Message("assistant", response, documents=docs))
+
+    # Mock find_citations to return 1 citation for the short sentence
+    mock_citations = [
+        {
+            "response_begin": 0,
+            "response_end": 6,  # "Short." = 6 characters
+            "response_text": "Short",
+            "citation_doc_id": "doc1",
+            "citation_text": "Short fact.",
+        }
+    ]
+
+    with patch(
+        "mellea.stdlib.components.intrinsic.rag.find_citations",
+        return_value=mock_citations,
+    ):
+        # CLAIMS mode: 1 citation out of 2 sentences = 50%
+        req_claims = CitationRequirement(
+            min_citation_coverage=0.5, mode=CitationMode.CLAIMS
+        )
+        result_claims = await req_claims.validate(backend, ctx)
+
+        # CHARACTERS mode: 6 characters out of 67 total = ~9%
+        req_chars = CitationRequirement(
+            min_citation_coverage=0.5, mode=CitationMode.CHARACTERS
+        )
+        result_chars = await req_chars.validate(backend, ctx)
+
+        # CLAIMS mode should pass (50% >= 50%)
+        assert result_claims.as_bool()
+        assert result_claims.score == 0.5
+
+        # CHARACTERS mode should fail (~9% < 50%)
+        assert not result_chars.as_bool()
+        assert result_chars.score is not None
+        assert result_chars.score < 0.1