import re
import unicodedata

import pytesseract


# Order in which box coordinates are serialized in each prediction
bounding_box_order = ["left", "top", "right", "bottom"]

# Take the model's bounding-box predictions and return the text extracted
# inside each box.
def one_shot_ocr_service(image, output):
    # iterate over the detections
    response = []
    detections = output['bounding-boxes']

    for detection in detections:

        # crop the image to the current detection's box
        coordinates = detection["coordinates"]
        cropped = image.crop((float(coordinates["left"]),
                              float(coordinates["top"]),
                              float(coordinates["right"]),
                              float(coordinates["bottom"])))

        # convert the crop to grayscale for better OCR accuracy
        processed_img = cropped.convert('L')

        # extract the words Tesseract recognized with positive confidence
        df = pytesseract.image_to_data(processed_img, output_type='data.frame')
        valid_df = df[df["conf"] > 0]
        extracted_text = " ".join(valid_df["text"].astype(str).values)

        # normalize unicode, drop non-ASCII characters, and strip common OCR artifacts
        extracted_text = (unicodedata.normalize('NFKD', extracted_text)
                          .encode('ascii', 'ignore').decode()
                          .strip()
                          .replace("\n", " ")
                          .replace("...", ".").replace("..", ".")
                          .replace('”', ' ').replace('“', ' ')
                          .replace("'", ' ').replace('"', '')
                          .replace("alt/1m", "")
                          .strip())
        extracted_text = re.sub(r'[^A-Za-z0-9.!?,;%:=()\[\]$€&/\- ]+', '', extracted_text)
        # collapse runs of whitespace into single spaces
        extracted_text = " ".join(extracted_text.split())

        # wrap each prediction inside a dictionary
        if len(extracted_text) != 0:
            prediction = {
                "text": extracted_text,
                "box": [coordinates[el] for el in bounding_box_order],
                "score": valid_df["conf"].mean() / 100.0,
            }

            response.append(prediction)

    return response
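
# Illustrative (assumed) shape of the `output` argument consumed above; the
# exact schema depends on the upstream detection model, so treat this as a
# sketch rather than a contract:
#
#   output = {
#       "bounding-boxes": [
#           {"coordinates": {"left": 12, "top": 34, "right": 120, "bottom": 56}},
#       ]
#   }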

# Take an image and return the text extracted from the whole page.
def ocr_service(image):
    # convert the image to grayscale for better OCR accuracy
    processed_img = image.convert('L')

    # get word-level data including boxes, confidences, line and page numbers
    df = pytesseract.image_to_data(processed_img, output_type='data.frame')
    valid_df = df[df["conf"] > 0]

    # nothing recognized with positive confidence, so there is nothing to return
    if valid_df.empty:
        return []

    # process the text the same way as in one_shot_ocr_service
    extracted_text = " ".join(valid_df["text"].astype(str).values)
    extracted_text = (unicodedata.normalize('NFKD', extracted_text)
                      .encode('ascii', 'ignore').decode()
                      .strip()
                      .replace("\n", " ")
                      .replace("...", ".").replace("..", ".")
                      .replace('”', ' ').replace('“', ' ')
                      .replace("'", ' ').replace('"', '')
                      .replace("alt/1m", "")
                      .strip())
    extracted_text = re.sub(r'[^A-Za-z0-9.!?,;%:=()\[\]$€&/\- ]+', '', extracted_text)
    extracted_text = " ".join(extracted_text.split())

    # bounding box that encloses all recognized words (extremes over the word boxes)
    coordinates = {}
    coordinates["left"] = valid_df["left"].min()
    coordinates["top"] = valid_df["top"].min()
    coordinates["right"] = (valid_df["left"] + valid_df["width"]).max()
    coordinates["bottom"] = (valid_df["top"] + valid_df["height"]).max()
    bounding_box = [coordinates[el].item() for el in bounding_box_order]

    # wrap the prediction inside a dictionary
    response = {}
    response["text"] = extracted_text
    response["box"] = bounding_box
    response["score"] = valid_df["conf"].mean() / 100.0

    return [response]
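

# Minimal usage sketch: assumes Pillow is installed, the Tesseract binary is
# on the PATH, and "sample.png" is a placeholder image path.
if __name__ == "__main__":
    from PIL import Image

    img = Image.open("sample.png")

    # full-page OCR
    print(ocr_service(img))

    # OCR restricted to a (hypothetical) detector output covering the whole image
    detector_output = {
        "bounding-boxes": [
            {"coordinates": {"left": 0, "top": 0,
                             "right": img.width, "bottom": img.height}}
        ]
    }
    print(one_shot_ocr_service(img, detector_output))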