import re
import unicodedata

import pytesseract


# Order in which box coordinates are serialized in each prediction
bounding_box_order = ["left", "top", "right", "bottom"]

# Take the model's bounding-box predictions and return the text extracted
# inside each box.
def one_shot_ocr_service(image, output):
    # iterate over the detections
    response = []
    detections = output['bounding-boxes']

    for detection in detections:

        # crop the image to the current detection's box
        coordinates = detection["coordinates"]
        cropped = image.crop((float(coordinates["left"]),
                              float(coordinates["top"]),
                              float(coordinates["right"]),
                              float(coordinates["bottom"])))

        # convert the crop to grayscale for better OCR accuracy
        processed_img = cropped.convert('L')

        # extract the words Tesseract recognized with positive confidence
        df = pytesseract.image_to_data(processed_img, output_type='data.frame')
        valid_df = df[df["conf"] > 0]
        extracted_text = " ".join(valid_df["text"].astype(str).values)

        # normalize unicode, drop non-ASCII characters, and strip common OCR artifacts
        extracted_text = (unicodedata.normalize('NFKD', extracted_text)
                          .encode('ascii', 'ignore').decode()
                          .strip()
                          .replace("\n", " ")
                          .replace("...", ".").replace("..", ".")
                          .replace('”', ' ').replace('“', ' ')
                          .replace("'", ' ').replace('"', '')
                          .replace("alt/1m", "")
                          .strip())
        extracted_text = re.sub(r'[^A-Za-z0-9.!?,;%:=()\[\]$€&/\- ]+', '', extracted_text)
        # collapse runs of whitespace into single spaces
        extracted_text = " ".join(extracted_text.split())

        # wrap each prediction inside a dictionary
        if len(extracted_text) != 0:
            prediction = {
                "text": extracted_text,
                "box": [coordinates[el] for el in bounding_box_order],
                "score": valid_df["conf"].mean() / 100.0,
            }

            response.append(prediction)

    return response
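
# Illustrative (assumed) shape of the `output` argument consumed above; the
# exact schema depends on the upstream detection model, so treat this as a
# sketch rather than a contract:
#
#   output = {
#       "bounding-boxes": [
#           {"coordinates": {"left": 12, "top": 34, "right": 120, "bottom": 56}},
#       ]
#   }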

# Take an image and return the text extracted from the whole page.
def ocr_service(image):
    # convert the image to grayscale for better OCR accuracy
    processed_img = image.convert('L')

    # get word-level data including boxes, confidences, line and page numbers
    df = pytesseract.image_to_data(processed_img, output_type='data.frame')
    valid_df = df[df["conf"] > 0]

    # nothing recognized with positive confidence, so there is nothing to return
    if valid_df.empty:
        return []

    # process the text the same way as in one_shot_ocr_service
    extracted_text = " ".join(valid_df["text"].astype(str).values)
    extracted_text = (unicodedata.normalize('NFKD', extracted_text)
                      .encode('ascii', 'ignore').decode()
                      .strip()
                      .replace("\n", " ")
                      .replace("...", ".").replace("..", ".")
                      .replace('”', ' ').replace('“', ' ')
                      .replace("'", ' ').replace('"', '')
                      .replace("alt/1m", "")
                      .strip())
    extracted_text = re.sub(r'[^A-Za-z0-9.!?,;%:=()\[\]$€&/\- ]+', '', extracted_text)
    extracted_text = " ".join(extracted_text.split())

    # bounding box that encloses all recognized words (extremes over the word boxes)
    coordinates = {}
    coordinates["left"] = valid_df["left"].min()
    coordinates["top"] = valid_df["top"].min()
    coordinates["right"] = (valid_df["left"] + valid_df["width"]).max()
    coordinates["bottom"] = (valid_df["top"] + valid_df["height"]).max()
    bounding_box = [coordinates[el].item() for el in bounding_box_order]

    # wrap the prediction inside a dictionary
    response = {}
    response["text"] = extracted_text
    response["box"] = bounding_box
    response["score"] = valid_df["conf"].mean() / 100.0

    return [response]
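

# Minimal usage sketch: assumes Pillow is installed, the Tesseract binary is
# on the PATH, and "sample.png" is a placeholder image path.
if __name__ == "__main__":
    from PIL import Image

    img = Image.open("sample.png")

    # full-page OCR
    print(ocr_service(img))

    # OCR restricted to a (hypothetical) detector output covering the whole image
    detector_output = {
        "bounding-boxes": [
            {"coordinates": {"left": 0, "top": 0,
                             "right": img.width, "bottom": img.height}}
        ]
    }
    print(one_shot_ocr_service(img, detector_output))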