ai-document-processing-pipeline/src/AIDocumentPipeline/shared/confidence/document_intelligence_confidence.py at main · Azure/ai-document-processing-pipeline · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
import copy
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Callable, Iterable, Optional

from azure.ai.documentintelligence.models import AnalyzeResult, DocumentPage, DocumentLine, DocumentWord

from shared.confidence.confidence_result import OVERALL_CONFIDENCE_KEY
from shared.confidence.confidence_utils import get_confidence_values
from shared.utils.value_utils import value_contains, value_match


class DIDocumentLine(DocumentLine):
    """
    A line in a document extracted by Azure AI Document Intelligence, carrying additional attributes
    on top of the base DocumentLine.

    Attributes:
        normalized_polygon (Optional[list[dict[str, float]]]): The polygon of the line as a list of
            points, each a dictionary with 'x' and 'y' keys. May be None.
        confidence (float): The confidence score of the document line.
        page_number (int): The page number where the document line is located, as supplied by the caller.
        contained_words (list[DocumentWord]): The list of words contained in the document line.
    """

    normalized_polygon: Optional[list[dict[str, float]]]
    confidence: float
    page_number: int
    contained_words: list[DocumentWord]

    def __init__(
        self,
        normalized_polygon: Optional[list[dict[str, float]]],
        confidence: float,
        page_number: int,
        contained_words: list[DocumentWord],
        *args: Any,
        **kwargs: Any
    ) -> None:
        """
        Initializes a new instance of the DIDocumentLine class based on a DocumentLine instance.

        Args:
            normalized_polygon: The normalized polygon coordinates of the document line.
            confidence: The confidence score of the document line.
            page_number: The page number where the document line is located.
            contained_words: The list of words contained in the document line.
            *args: Positional arguments forwarded unchanged to the DocumentLine initializer.
            **kwargs: Keyword arguments forwarded unchanged to the DocumentLine initializer.
        """

        # The base class receives only the pass-through arguments; the extra
        # attributes are stored on the instance afterwards.
        super().__init__(*args, **kwargs)
        self.normalized_polygon = normalized_polygon
        self.confidence = confidence
        self.page_number = page_number
        self.contained_words = contained_words

    def to_dict(self) -> dict[str, Any]:
        """
        Converts the DIDocumentLine instance to a dictionary.

        Returns:
            dict: The result of the base class `as_dict()` extended with the
            'normalized_polygon', 'confidence', 'page_number' and 'contained_words' keys.
            The contained words are stored exactly as held by the instance; they are not
            converted to dictionaries here.
        """

        as_dict = self.as_dict()
        as_dict['normalized_polygon'] = self.normalized_polygon
        as_dict['confidence'] = self.confidence
        as_dict['page_number'] = self.page_number
        as_dict['contained_words'] = self.contained_words

        return as_dict


class DIDocumentWord(DocumentWord):
    """
    A document word extracted by Azure AI Document Intelligence, carrying additional attributes
    on top of the base DocumentWord.

    Attributes:
        normalized_polygon (Optional[list[dict[str, float]]]): The polygon of the word as a list of
            points, each a dictionary with 'x' and 'y' keys. May be None.
        page_number (int): The page number where the document word is located, as supplied by the caller.
    """

    normalized_polygon: Optional[list[dict[str, float]]]
    page_number: int

    def __init__(
        self,
        normalized_polygon: Optional[list[dict[str, float]]],
        page_number: int,
        *args: Any,
        **kwargs: Any
    ) -> None:
        """
        Initializes a new DIDocumentWord instance based on a DocumentWord instance.

        Args:
            normalized_polygon: The normalized polygon coordinates of the document word.
            page_number: The page number where the document word is located.
            *args: Positional arguments forwarded unchanged to the DocumentWord initializer.
            **kwargs: Keyword arguments forwarded unchanged to the DocumentWord initializer.
        """

        # The base class receives only the pass-through arguments; the extra
        # attributes are stored on the instance afterwards.
        super().__init__(*args, **kwargs)
        self.normalized_polygon = normalized_polygon
        self.page_number = page_number

    def to_dict(self) -> dict[str, Any]:
        """
        Converts the DIDocumentWord instance to a dictionary.

        Returns:
            dict: The result of the base class `as_dict()` extended with the
            'normalized_polygon' and 'page_number' keys.
        """

        as_dict = self.as_dict()
        as_dict['normalized_polygon'] = self.normalized_polygon
        as_dict['page_number'] = self.page_number

        return as_dict


def normalize_polygon(
    page: DocumentPage,
    polygon: Optional[list[float]]
) -> list[dict[str, float]]:
    """
    Normalize a polygon's coordinates to page dimensions.

    The polygon is a flat list of alternating x, y coordinates
    (``[x0, y0, x1, y1, ...]``). Each x is divided by the page width and each y
    by the page height, and both are rounded to 3 decimal places.

    Args:
        page: The page to normalize the polygon to; its `width` and `height` are used as divisors.
        polygon: The polygon coordinates on the page to normalize. `None` or an empty list
            yields an empty result.

    Returns:
        list: The normalized polygon coordinates as a list of dictionaries with 'x' and 'y' keys.

    Raises:
        IndexError: If the polygon has an odd number of coordinates.
    """

    # A missing polygon has nothing to normalize; returning early also avoids
    # touching the page dimensions when they are not needed.
    if not polygon:
        return []

    width, height = page.width, page.height

    result = []
    for i in range(0, len(polygon), 2):
        # Read both coordinates before dividing so that a dangling x
        # (odd-length input) surfaces as an IndexError.
        x, y = polygon[i], polygon[i + 1]
        result.append({
            'x': round(x / width, 3),
            'y': round(y / height, 3)
        })

    return result


def extract_lines(
    analyze_result: AnalyzeResult,
    multiple_score_resolver: Callable[[list[float]], float] = min
) -> list[DIDocumentLine]:
    """
    Extract lines from the Azure AI Document Intelligence analysis result, enriching with confidence, contained words, and normalized polygons.

    Args:
        analyze_result: The Azure AI Document Intelligence analysis result to extract lines from.
        multiple_score_resolver: The function to resolve multiple confidence scores of contained words.
            It is only called when a line contains at least one word.

    Returns:
        list: The list of DIDocumentLine instances extracted from the analysis result.
        Each line's `page_number` is the zero-based index of its page within `analyze_result.pages`.
        A line that contains no words is given a confidence of 0.0.
    """

    di_lines = []
    for page_number, page in enumerate(analyze_result.pages):
        # `lines` and `words` may be absent on a page; treat that as "none"
        # instead of failing on iteration.
        page_words = page.words or []
        for line in page.lines or []:
            line_copy = copy.copy(line)

            contained_words = []
            for span in line_copy.spans:
                # Find words in the page that are fully contained within the span
                span_offset_start = span.offset
                span_offset_end = span_offset_start + span.length
                contained_words.extend(
                    word
                    for word in page_words
                    if word.span.offset >= span_offset_start
                    and word.span.offset + word.span.length <= span_offset_end
                )

            contained_words_conf_scores = [
                word.confidence for word in contained_words
            ]

            # Resolvers such as `min` raise on an empty sequence, so a line
            # without any matched word falls back to the lowest confidence.
            line_confidence = (
                multiple_score_resolver(contained_words_conf_scores)
                if contained_words_conf_scores
                else 0.0
            )

            di_lines.append(
                DIDocumentLine(
                    **line_copy,
                    contained_words=contained_words,
                    page_number=page_number,
                    confidence=line_confidence,
                    normalized_polygon=normalize_polygon(
                        page, line_copy.polygon
                    )
                )
            )
    return di_lines


def find_matching_lines(
    value: Any,
    di_lines: list[DIDocumentLine],
    value_matcher: Callable[[str, str], bool] = value_match,
) -> list[DIDocumentLine]:
    """
    Find lines in the pre-computed di_lines that match a given value.

    The value is converted to a string when it is not one already. Lines are first
    selected with `value_matcher`; if that selects nothing, `value_contains` is used
    as a fallback matcher.

    Args:
        value: The value to match. `None`, the empty string and other empty non-numeric
            values yield no matches. Numeric values, including 0 and False, are searched
            for by their string form.
        di_lines: Precomputed list of DIDocumentLine instances.
        value_matcher: The function to use for matching values; called as
            `value_matcher(value, line.content)`.

    Returns:
        list: The list of DIDocumentLine instances that match the given value.
    """
    # A plain truthiness test would discard a legitimate numeric zero before it
    # is ever converted to text, so numbers are exempt from the emptiness check.
    is_number = isinstance(value, (int, float))
    if not value and not is_number:
        return []

    if not isinstance(value, str):
        value = str(value)

    matching_lines = [
        line for line in di_lines if value_matcher(value, line.content)
    ]

    # If no matching lines using the primary matcher, try secondary one.
    if not matching_lines:
        matching_lines = [
            line for line in di_lines if value_contains(value, line.content)
        ]

    return matching_lines


def get_field_confidence_score(
    scores: Iterable[float],
    default_score: Optional[float | int] = None,
    multiple_score_resolver: callable = min
) -> float:
    """
    Determines the field confidence score based on potentially multiple scores.

    Args:
        scores: The confidence scores for the field.
        default_score: The default confidence score to return if no scores are provided.
        multiple_score_resolver: The function to resolve multiple confidence scores.

    Returns:
        float: The field confidence score.
    """

    if len(scores) == 1:
        return scores[0]
    if len(scores) == 0:
        return default_score
    return multiple_score_resolver(scores)


def evaluate_confidence(
    extract_result: dict,
    analyze_result: AnalyzeResult
) -> dict[str, Any]:
    """
    Evaluate the confidence of extracted fields based on the Azure AI Document Intelligence analysis result.

    Args:
        extract_result: The extracted fields to evaluate. Values may be nested dictionaries
            and lists; every leaf value is evaluated individually.
        analyze_result: The Azure AI Document Intelligence analysis result to evaluate against.

    Returns:
        dict: The confidence evaluation of the extracted fields, keyed by field name in the
        same order as `extract_result`, plus an OVERALL_CONFIDENCE_KEY entry holding the mean
        of the values returned by `get_confidence_values` (0.0 when there are none).
    """

    # Lines are extracted once up front and shared by every field evaluation.
    di_lines = extract_lines(analyze_result, multiple_score_resolver=min)

    def evaluate_field_value_confidence(
        value: Any,
    ) -> Any:
        """
        Evaluate the confidence of a field value based on the Azure AI Document Intelligence analysis result.

        Args:
            value: The field value to evaluate.

        Returns:
            For a dictionary, a dictionary with the same keys and evaluated values; for a list,
            a list of evaluated items; otherwise a dictionary with the 'confidence',
            'matching_lines', 'normalized_polygons' and 'value' keys.
        """

        if isinstance(value, dict):
            return {
                key: evaluate_field_value_confidence(val)
                for key, val in value.items()
            }
        if isinstance(value, list):
            return [
                evaluate_field_value_confidence(item)
                for item in value
            ]

        matching_lines = find_matching_lines(
            value, di_lines, value_matcher=value_match)
        # A value that matches no line gets a confidence of 0.0; several
        # matches resolve to the lowest line confidence.
        field_confidence_score = get_field_confidence_score(
            scores=[match.confidence for match in matching_lines],
            default_score=0.0,
            multiple_score_resolver=min
        )
        normalized_polygons = [
            line.normalized_polygon for line in matching_lines
        ]
        return {
            "confidence": field_confidence_score,
            "matching_lines": matching_lines,
            "normalized_polygons": normalized_polygons,
            "value": value
        }

    # Process each field concurrently. Executor.map yields results in the order
    # the inputs were given, so the output keeps the field order of
    # extract_result regardless of which evaluation finishes first.
    with ThreadPoolExecutor() as executor:
        field_results = list(
            executor.map(
                evaluate_field_value_confidence, extract_result.values()
            )
        )

    confidence = dict(zip(extract_result.keys(), field_results))

    confidence_scores = get_confidence_values(confidence)
    confidence[OVERALL_CONFIDENCE_KEY] = (
        sum(confidence_scores) / len(confidence_scores)
        if confidence_scores
        else 0.0
    )

    return confidence