{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# OCR readings from a cropped image region\n",
    "\n",
    "Crops the same user-selected region out of every `.jpg` in `data/`, optionally mirrors it, reads the text with PaddleOCR and exports the results to Excel:\n",
    "\n",
    "- `data.xlsx`: one row per image that produced text (timestamp, raw text, number when the text parses as one).\n",
    "- `data_clean.xlsx`: only the rows whose timestamp and number both parse."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import os\n",
    "from datetime import datetime\n",
    "\n",
    "import cv2\n",
    "import xlsxwriter\n",
    "from paddleocr import PaddleOCR"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Image helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ocr_image(image_path, ocr):\n",
    "    \"\"\"Return the text of the first detection the OCR engine reports for the image.\n",
    "\n",
    "    Raises ValueError when nothing is detected, so callers get a clear reason\n",
    "    instead of a TypeError/IndexError from indexing an empty result.\n",
    "    \"\"\"\n",
    "    result = ocr.ocr(image_path, cls=True)\n",
    "    if not result or not result[0]:\n",
    "        raise ValueError(f\"No text detected in {image_path}\")\n",
    "    # result[0] is read as the detections of this image and [0][1][0] as the\n",
    "    # text of the first one. NOTE(review): confirm against the installed PaddleOCR version.\n",
    "    return result[0][0][1][0]\n",
    "\n",
    "\n",
    "def cropUI(image_path):\n",
    "    \"\"\"Show the image in an OpenCV window and return the region selected by the user.\"\"\"\n",
    "    image = cv2.imread(image_path)\n",
    "    if image is None:\n",
    "        # cv2.imread reports an unreadable file by returning None instead of raising.\n",
    "        raise FileNotFoundError(f\"Could not read image: {image_path}\")\n",
    "    try:\n",
    "        r = cv2.selectROI(image)\n",
    "    finally:\n",
    "        # Close the selection window even when the selection is interrupted.\n",
    "        cv2.destroyAllWindows()\n",
    "    return r\n",
    "\n",
    "\n",
    "def cropImage(image_path, r, flip, cropped_folder):\n",
    "    \"\"\"Crop the image to the region r, mirror it when flip is 'y', and save it.\n",
    "\n",
    "    r is indexed as (x, y, width, height). The result is written to\n",
    "    cropped_folder under the same file name as the source image.\n",
    "\n",
    "    Raises FileNotFoundError if the image cannot be read, ValueError if the\n",
    "    region is empty and OSError if the cropped image cannot be written.\n",
    "    \"\"\"\n",
    "    image = cv2.imread(image_path)\n",
    "    if image is None:\n",
    "        raise FileNotFoundError(f\"Could not read image: {image_path}\")\n",
    "\n",
    "    cropped = image[int(r[1]) : int(r[1] + r[3]), int(r[0]) : int(r[0] + r[2])]\n",
    "    if cropped.size == 0:\n",
    "        raise ValueError(f\"Crop region {tuple(r)} is empty for {image_path}\")\n",
    "\n",
    "    if flip == \"y\":\n",
    "        # flipCode 1 mirrors the image horizontally.\n",
    "        cropped = cv2.flip(cropped, 1)\n",
    "\n",
    "    # basename copes with the platform path separator, unlike splitting on a slash.\n",
    "    new_name = os.path.join(cropped_folder, os.path.basename(image_path))\n",
    "    if not cv2.imwrite(new_name, cropped):\n",
    "        # cv2.imwrite returns False on failure instead of raising.\n",
    "        raise OSError(f\"Could not write cropped image: {new_name}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Excel export helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FILENAME_DATE_FORMAT = \"%Y-%m-%d %H-%M-%S-%f\"\n",
    "EXCEL_DATE_FORMAT = \"dd/mm/yy hh:mm:ss\"\n",
    "\n",
    "\n",
    "def writeDate(worksheet, row, column, date, format):\n",
    "    \"\"\"Parse date (text in FILENAME_DATE_FORMAT) and write it as an Excel datetime cell.\n",
    "\n",
    "    Raises ValueError when date does not match FILENAME_DATE_FORMAT.\n",
    "    \"\"\"\n",
    "    parsed_datetime = datetime.strptime(date, FILENAME_DATE_FORMAT)\n",
    "    worksheet.write_datetime(row, column, parsed_datetime, format)\n",
    "\n",
    "\n",
    "def _parse_reading(value):\n",
    "    \"\"\"Return the first six characters of value as a finite float, or None if they are not one.\"\"\"\n",
    "    try:\n",
    "        number = float(value[:6])\n",
    "    except (TypeError, ValueError):\n",
    "        return None\n",
    "    # NaN/inf are not writable as plain Excel numbers, so treat them as unparsable.\n",
    "    return number if math.isfinite(number) else None\n",
    "\n",
    "\n",
    "def _export(data, fileName, skip_invalid):\n",
    "    \"\"\"Write one row per entry of data: timestamp, raw OCR text, numeric reading.\n",
    "\n",
    "    data maps image file names to OCR text. Every row is validated before the\n",
    "    workbook is created, so a bad entry can leave neither a half-written row\n",
    "    nor an unclosed workbook behind.\n",
    "\n",
    "    With skip_invalid, entries whose timestamp or number cannot be parsed are\n",
    "    dropped. Without it, a bad timestamp raises (IndexError/ValueError) and a\n",
    "    bad number just leaves the third column empty.\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    for key, value in data.items():\n",
    "        number = _parse_reading(value)\n",
    "        try:\n",
    "            # Keys are file names shaped like <prefix> <timestamp>.<ext>; keep only the timestamp.\n",
    "            date = os.path.splitext(key)[0].split(\" \", 1)[1]\n",
    "            datetime.strptime(date, FILENAME_DATE_FORMAT)\n",
    "        except (IndexError, ValueError):\n",
    "            if skip_invalid:\n",
    "                continue\n",
    "            raise\n",
    "        if skip_invalid and number is None:\n",
    "            continue\n",
    "        rows.append((date, value, number))\n",
    "\n",
    "    # The context manager closes (and saves) the workbook even if a write fails.\n",
    "    with xlsxwriter.Workbook(fileName) as workbook:\n",
    "        worksheet = workbook.add_worksheet()\n",
    "        dateFormat = workbook.add_format({\"num_format\": EXCEL_DATE_FORMAT})\n",
    "\n",
    "        for row, (date, value, number) in enumerate(rows):\n",
    "            writeDate(worksheet, row, 0, date, dateFormat)\n",
    "            worksheet.write(row, 1, value)\n",
    "            if number is not None:\n",
    "                worksheet.write(row, 2, number)\n",
    "\n",
    "\n",
    "def data2excel(data):\n",
    "    \"\"\"Save every OCR result to data.xlsx; the number column stays empty when the text is not numeric.\"\"\"\n",
    "    _export(data, \"data.xlsx\", skip_invalid=False)\n",
    "\n",
    "\n",
    "def data2cleanexcel(data):\n",
    "    \"\"\"Save to data_clean.xlsx only the results whose timestamp and number both parse.\"\"\"\n",
    "    _export(data, \"data_clean.xlsx\", skip_invalid=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FOLDER = \"data/\"\n",
    "cropped_folder = os.path.join(FOLDER, \"cropped\")\n",
    "\n",
    "# ocr settings\n",
    "# NOTE(review): KMP_DUPLICATE_LIB_OK presumably works around a duplicate OpenMP runtime error; confirm it is still needed.\n",
    "os.environ[\"KMP_DUPLICATE_LIB_OK\"] = \"TRUE\"\n",
    "ocr = PaddleOCR(use_angle_cls=True, lang=\"en\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select the region and crop every image\n",
    "\n",
    "The region is selected once, on the first image, and applied to all images."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sorted so the image used for the selection and the row order in Excel are reproducible.\n",
    "images = sorted(f for f in os.listdir(FOLDER) if f.lower().endswith(\".jpg\"))\n",
    "if not images:\n",
    "    raise FileNotFoundError(f\"No .jpg images found in {FOLDER}\")\n",
    "\n",
    "# *** start GUIs ***\n",
    "region = cropUI(os.path.join(FOLDER, images[0]))\n",
    "if region[2] == 0 or region[3] == 0:\n",
    "    # A cancelled selection comes back with zero width/height.\n",
    "    raise ValueError(\"No region was selected\")\n",
    "\n",
    "flip = input(\"Do you want to flip the images horizontaly? (y/n): \").strip().lower()\n",
    "\n",
    "os.makedirs(cropped_folder, exist_ok=True)\n",
    "\n",
    "# *** start cropping ***\n",
    "for image in images:\n",
    "    cropImage(os.path.join(FOLDER, image), region, flip, cropped_folder)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## OCR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# *** start OCR ***\n",
    "# Only the images cropped above are read, so leftovers from earlier runs in the folder are ignored.\n",
    "data = {}\n",
    "for image in images:\n",
    "    path = os.path.join(cropped_folder, image)\n",
    "    try:\n",
    "        data[image] = ocr_image(path, ocr)\n",
    "    except Exception as error:\n",
    "        # Best effort: skip the image, but report which one failed and why.\n",
    "        print(f\"Error in cropped image {image}: {error}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2excel(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# clean excel file\n",
    "data2cleanexcel(data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}