In [None]:
from paddleocr import PaddleOCR
import os
import cv2
import xlsxwriter


def ocr_image(image_path, ocr):
    """Run OCR on one image and return a single recognised string.

    Args:
        image_path: Path of the image handed to ``ocr.ocr``.
        ocr: Object exposing ``ocr(path, cls=...)``; the notebook passes a
            ``PaddleOCR`` instance.

    Returns:
        The element at ``[0][0][1][0]`` of the OCR result — presumably the
        text of the first detected line of the first page; TODO confirm
        against PaddleOCR's result layout.

    Raises:
        TypeError, IndexError: Whatever the nested lookup raises when the
            result does not have that shape (for example when nothing was
            detected).
    """
    pages = ocr.ocr(image_path, cls=True)
    first_entry = pages[0][0]
    recognised = first_entry[1]
    return recognised[0]


def cropUI(image_path):
    """Show an image in an OpenCV window and let the user select a region.

    Args:
        image_path: Path of the image to display.

    Returns:
        The value returned by ``cv2.selectROI``; ``cropImage`` indexes it as
        ``(x, y, width, height)``.

    Raises:
        OSError: If OpenCV could not read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise OSError(f"Could not read image: {image_path!r}")

    try:
        r = cv2.selectROI(image)
    finally:
        # Close the selection window even when the selection is interrupted.
        cv2.destroyAllWindows()

    return r


def cropImage(image_path, r, flip, cropped_folder):
    """Crop one image to a rectangle and save the result in ``cropped_folder``.

    Args:
        image_path: Path of the source image.
        r: Crop rectangle indexed as ``(x, y, width, height)``.
        flip: Answer to the flip prompt. ``"y"`` or ``"yes"`` (any case,
            surrounding whitespace ignored) mirrors the crop horizontally;
            anything else leaves it as is.
        cropped_folder: Directory that receives the cropped copy. The file
            keeps the base name of ``image_path``.

    Raises:
        OSError: If the source image cannot be read or the crop cannot be
            written.
        ValueError: If the rectangle selects no pixels.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise OSError(f"Could not read image: {image_path!r}")

    left, top = int(r[0]), int(r[1])
    right, bottom = int(r[0] + r[2]), int(r[1] + r[3])
    cropped = image[top:bottom, left:right]
    if cropped.size == 0:
        raise ValueError(f"Crop region {tuple(r)!r} selects no pixels in {image_path!r}")

    if str(flip).strip().lower() in ("y", "yes"):
        cropped = cv2.flip(cropped, 1)

    # basename handles the platform's path separators, unlike splitting on "/".
    new_name = os.path.join(cropped_folder, os.path.basename(image_path))
    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(new_name, cropped):
        raise OSError(f"Could not write cropped image: {new_name!r}")


In [None]:
from datetime import datetime


def writeDate(worksheet, row, column, date, format):
    """Parse a timestamp string and write it to a cell as a datetime.

    Args:
        worksheet: Object exposing ``write_datetime(row, column, value, format)``.
        row: Row index of the target cell.
        column: Column index of the target cell.
        date: Timestamp text laid out as ``%Y-%m-%d %H-%M-%S-%f``.
        format: Cell format passed through to ``write_datetime``.

    Raises:
        ValueError: If ``date`` does not match the expected layout. Parsing
            happens before the write, so nothing is written in that case.
    """
    timestamp = datetime.strptime(date, "%Y-%m-%d %H-%M-%S-%f")
    worksheet.write_datetime(row, column, timestamp, format)


def data2excel(data, fileName="data.xlsx"):
    """Write every OCR result to an Excel workbook, one row per entry.

    Column 0 receives the timestamp taken from the key, column 1 the raw
    value, and column 2 the number parsed from the first six characters of
    the value when that parse succeeds.

    Args:
        data: Mapping of file name to recognised text. Each key must contain
            a space followed by a ``%Y-%m-%d %H-%M-%S-%f`` timestamp and a
            file extension.
        fileName: Path of the workbook to create. Defaults to ``"data.xlsx"``.

    Raises:
        IndexError: If a key contains no space.
        ValueError: If the timestamp part of a key cannot be parsed.
    """
    # The context manager closes the workbook even when a row raises, so the
    # file handle is never left dangling.
    with xlsxwriter.Workbook(fileName) as workbook:
        worksheet = workbook.add_worksheet()
        dateFormat = workbook.add_format({"num_format": "dd/mm/yy hh:mm:ss"})

        for row, (key, value) in enumerate(data.items()):
            # Drop the leading name and the file extension, whatever its length.
            date = os.path.splitext(key.split(" ", 1)[1])[0]
            writeDate(worksheet, row, 0, date, dateFormat)
            worksheet.write(row, 1, value)
            try:
                worksheet.write(row, 2, float(value[:6]))
            except (TypeError, ValueError):
                # Best effort: text that is not a usable number simply leaves
                # the numeric column empty for this row.
                pass


In [None]:
FOLDER = "data/"
# ocr settings
# NOTE(review): this is set after paddleocr was already imported in the first
# cell — confirm that is early enough for the setting to take effect.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
ocr = PaddleOCR(use_angle_cls=True, lang="en")

# *** start GUIs ***
# os.listdir returns names in arbitrary order; sorting makes both the image
# used for region selection and the row order of the export reproducible.
images = sorted(f for f in os.listdir(FOLDER) if f.lower().endswith(".jpg"))
if not images:
    raise FileNotFoundError(f"No .jpg images found in {FOLDER!r}")

# The region drawn on the first image is reused for every image.
region = cropUI(os.path.join(FOLDER, images[0]))

flip = input("Do you want to flip the images horizontaly? (y/n): ")

cropped_folder = os.path.join(FOLDER, "cropped")
os.makedirs(cropped_folder, exist_ok=True)

# *** start cropping ***
for image in images:
    cropImage(os.path.join(FOLDER, image), region, flip, cropped_folder)

# *** start OCR ***
# Only OCR the files cropped in this run; re-listing the folder would also
# pick up stale crops left behind by earlier runs.
cropped_images = list(images)
data = {}
for image in cropped_images:
    path = os.path.join(cropped_folder, image)
    try:
        text = ocr_image(path, ocr)
    except Exception as exc:
        # Best effort: one unreadable crop must not abort the whole batch,
        # but report which image failed and why.
        print(f"Error in cropped image {image}: {exc!r}")
        continue

    data[image] = text


In [None]:
# Export every OCR result collected in `data` to the workbook written by data2excel.
data2excel(data)

In [12]:
# clean excel file
def data2cleanexcel(data, fileName="data_clean.xlsx"):
    """Write only the fully parseable OCR results to an Excel workbook.

    An entry is kept when the first six characters of its value parse as a
    finite number and its key carries a valid timestamp. Kept entries occupy
    consecutive rows: timestamp in column 0, raw value in column 1, parsed
    number in column 2. Every other entry is skipped without writing any
    cell.

    Args:
        data: Mapping of file name to recognised text. Keys are expected to
            contain a space followed by a ``%Y-%m-%d %H-%M-%S-%f`` timestamp
            and a file extension.
        fileName: Path of the workbook to create. Defaults to
            ``"data_clean.xlsx"``.
    """
    # The context manager closes the workbook even if something unexpected raises.
    with xlsxwriter.Workbook(fileName) as workbook:
        worksheet = workbook.add_worksheet()
        dateFormat = workbook.add_format({"num_format": "dd/mm/yy hh:mm:ss"})

        row = 0
        for key, value in data.items():
            try:
                number = float(value[:6])
                # NaN and infinity fail this comparison. Reject them before
                # anything is written: xlsxwriter refuses such numbers by
                # default, which would otherwise abort the row half-way.
                if not float("-inf") < number < float("inf"):
                    raise ValueError("non-finite number")
                # Drop the leading name and the file extension.
                date = os.path.splitext(key.split(" ", 1)[1])[0]
                # writeDate parses before it writes, so a bad timestamp is
                # rejected here without touching the sheet.
                writeDate(worksheet, row, 0, date, dateFormat)
                worksheet.write(row, 1, value)
                worksheet.write(row, 2, number)
            except (AttributeError, IndexError, TypeError, ValueError):
                # Unparseable entry: skip it and reuse this row for the next one.
                continue

            row += 1

# Export `data` again via data2cleanexcel, which skips entries it cannot parse.
data2cleanexcel(data)