AI_logging/img2xlsx.ipynb

186 lines
5.3 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from paddleocr import PaddleOCR\n",
"import os\n",
"import cv2\n",
"import xlsxwriter\n",
"\n",
"\n",
"def ocr_image(image_path, ocr):\n",
"    \"\"\"Run OCR on one image and return the text of its first detection.\n",
"\n",
"    Args:\n",
"        image_path: Path of the image, passed straight to ``ocr.ocr``.\n",
"        ocr: Object exposing ``ocr(image_path, cls=True)``; this notebook passes\n",
"            a ``PaddleOCR`` instance.\n",
"\n",
"    Returns:\n",
"        ``result[0][0][1][0]`` -- presumably the recognised text of the first\n",
"        detected line (PaddleOCR layout ``[box, (text, score)]``; confirm\n",
"        against the installed version).\n",
"\n",
"    Raises:\n",
"        ValueError: If the OCR result holds no detection for the image.\n",
"    \"\"\"\n",
"    result = ocr.ocr(image_path, cls=True)\n",
"    # The result can be empty, or hold None/[] for the page, when nothing is\n",
"    # recognised; fail with a clear message instead of an opaque\n",
"    # TypeError/IndexError from the indexing below.\n",
"    if not result or not result[0]:\n",
"        raise ValueError(f\"No text detected in {image_path}\")\n",
"    return result[0][0][1][0]\n",
"\n",
"\n",
"def cropUI(image_path):\n",
"    \"\"\"Open an OpenCV window on the image and let the user select a region.\n",
"\n",
"    Args:\n",
"        image_path: Path of the image shown in the selection window.\n",
"\n",
"    Returns:\n",
"        The value returned by ``cv2.selectROI``; ``cropImage`` consumes it as\n",
"        ``(x, y, width, height)``.\n",
"\n",
"    Raises:\n",
"        ValueError: If the image cannot be read.\n",
"    \"\"\"\n",
"    image = cv2.imread(image_path)\n",
"    # cv2.imread signals failure by returning None rather than raising.\n",
"    if image is None:\n",
"        raise ValueError(f\"Could not read image: {image_path}\")\n",
"\n",
"    try:\n",
"        return cv2.selectROI(image)\n",
"    finally:\n",
"        # Always close the selection window, even if the selection fails.\n",
"        cv2.destroyAllWindows()\n",
"\n",
"\n",
"def cropImage(image_path, r, flip, cropped_folder):\n",
"    \"\"\"Crop an image to a region, optionally mirror it, and save the result.\n",
"\n",
"    The output keeps the file name of ``image_path`` and is written into\n",
"    ``cropped_folder``.\n",
"\n",
"    Args:\n",
"        image_path: Path of the image to crop.\n",
"        r: Region as ``(x, y, width, height)``, e.g. the value from ``cropUI``.\n",
"        flip: The crop is mirrored with ``cv2.flip(cropped, 1)`` when this\n",
"            equals ``\"y\"``.\n",
"        cropped_folder: Existing directory that receives the cropped image.\n",
"\n",
"    Raises:\n",
"        ValueError: If the image cannot be read.\n",
"    \"\"\"\n",
"    image = cv2.imread(image_path)\n",
"    # cv2.imread signals failure by returning None rather than raising.\n",
"    if image is None:\n",
"        raise ValueError(f\"Could not read image: {image_path}\")\n",
"\n",
"    # Rows are indexed by y/height, columns by x/width.\n",
"    cropped = image[int(r[1]) : int(r[1] + r[3]), int(r[0]) : int(r[0] + r[2])]\n",
"\n",
"    if flip == \"y\":\n",
"        cropped = cv2.flip(cropped, 1)\n",
"\n",
"    # Save under the same file name inside the cropped folder; basename copes\n",
"    # with the platform's path separator, unlike splitting on \"/\".\n",
"    new_name = os.path.join(cropped_folder, os.path.basename(image_path))\n",
"    cv2.imwrite(new_name, cropped)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"\n",
"\n",
"def writeDate(worksheet, row, column, date, format):\n",
"    \"\"\"Write a timestamp string into a worksheet cell as a datetime value.\n",
"\n",
"    ``date`` is parsed with the pattern ``%Y-%m-%d %H-%M-%S-%f`` and passed to\n",
"    ``worksheet.write_datetime`` for the cell at (``row``, ``column``) together\n",
"    with the cell ``format``.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``date`` does not match the pattern.\n",
"    \"\"\"\n",
"    worksheet.write_datetime(\n",
"        row,\n",
"        column,\n",
"        datetime.strptime(date, \"%Y-%m-%d %H-%M-%S-%f\"),\n",
"        format,\n",
"    )\n",
"\n",
"\n",
"def data2excel(data, fileName=\"data.xlsx\"):\n",
"    \"\"\"Export OCR results to an Excel workbook, one row per image.\n",
"\n",
"    Columns: 0 = timestamp parsed from the image file name, 1 = raw OCR text,\n",
"    2 = the first six characters of the text as a float (left empty when\n",
"    they are not numeric).\n",
"\n",
"    Args:\n",
"        data: Mapping of image file name to OCR text. The timestamp is the\n",
"            part of the file name after the first space, without extension.\n",
"        fileName: Path of the workbook to create.\n",
"\n",
"    Raises:\n",
"        IndexError: If a file name contains no space.\n",
"        ValueError: If the timestamp part of a file name cannot be parsed.\n",
"    \"\"\"\n",
"    # The context manager closes the workbook even when a row raises, so the\n",
"    # file handle is never leaked.\n",
"    with xlsxwriter.Workbook(fileName) as workbook:\n",
"        worksheet = workbook.add_worksheet()\n",
"        dateFormat = workbook.add_format({\"num_format\": \"dd/mm/yy hh:mm:ss\"})\n",
"\n",
"        for row, (key, value) in enumerate(data.items()):\n",
"            date = os.path.splitext(key.split(\" \", 1)[1])[0]\n",
"            writeDate(worksheet, row, 0, date, dateFormat)\n",
"            worksheet.write(row, 1, value)\n",
"            try:\n",
"                worksheet.write(row, 2, float(value[:6]))\n",
"            except (TypeError, ValueError):\n",
"                # Non-numeric reading: keep only the raw text for this row.\n",
"                pass\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"FOLDER = \"data/\"\n",
"# ocr settings\n",
"# NOTE(review): presumably a workaround for duplicate OpenMP runtimes being\n",
"# loaded by the OCR stack -- confirm it is still needed.\n",
"os.environ[\"KMP_DUPLICATE_LIB_OK\"] = \"TRUE\"\n",
"ocr = PaddleOCR(use_angle_cls=True, lang=\"en\")\n",
"\n",
"# *** start GUIs ***\n",
"# Sorted because os.listdir returns entries in arbitrary order; this makes\n",
"# the reference image and the later row order deterministic.\n",
"images = sorted(f for f in os.listdir(FOLDER) if f.endswith(\".jpg\"))\n",
"if not images:\n",
"    raise FileNotFoundError(f\"No .jpg images found in {FOLDER!r}\")\n",
"# The region selected on the first image is reused for every image.\n",
"region = cropUI(os.path.join(FOLDER, images[0]))\n",
"\n",
"# Normalised so that answers such as \"Y\" or \" y\" are accepted as well.\n",
"flip = input(\"Do you want to flip the images horizontaly? (y/n): \").strip().lower()\n",
"\n",
"cropped_folder = os.path.join(FOLDER, \"cropped\")\n",
"os.makedirs(cropped_folder, exist_ok=True)\n",
"\n",
"# *** start cropping ***\n",
"for image in images:\n",
"    cropImage(os.path.join(FOLDER, image), region, flip, cropped_folder)\n",
"\n",
"# *** start OCR ***\n",
"cropped_images = sorted(f for f in os.listdir(cropped_folder) if f.endswith(\".jpg\"))\n",
"data = {}\n",
"for image in cropped_images:\n",
"    path = os.path.join(cropped_folder, image)\n",
"    try:\n",
"        text = ocr_image(path, ocr)\n",
"    except Exception as error:\n",
"        # Best effort: report which image failed and why, then keep going.\n",
"        print(f\"Error in cropped image {image}: {error}\")\n",
"        continue\n",
"\n",
"    data[image] = text\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export every OCR result; rows are kept even when the text is not numeric.\n",
"data2excel(data)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# clean excel file\n",
"def data2cleanexcel(data, fileName=\"data_clean.xlsx\"):\n",
"    \"\"\"Export only the OCR results that yield both a number and a timestamp.\n",
"\n",
"    Entries whose text does not start with a finite number (first six\n",
"    characters), or whose file name holds no parsable timestamp, are skipped\n",
"    without leaving a gap, so the sheet contains complete rows only.\n",
"    Columns: 0 = timestamp, 1 = raw OCR text, 2 = numeric value.\n",
"\n",
"    Args:\n",
"        data: Mapping of image file name to OCR text.\n",
"        fileName: Path of the workbook to create.\n",
"    \"\"\"\n",
"    import math\n",
"\n",
"    # The context manager closes the workbook even if something raises.\n",
"    with xlsxwriter.Workbook(fileName) as workbook:\n",
"        worksheet = workbook.add_worksheet()\n",
"        dateFormat = workbook.add_format({\"num_format\": \"dd/mm/yy hh:mm:ss\"})\n",
"\n",
"        row = 0\n",
"        for key, value in data.items():\n",
"            try:\n",
"                # Validate the number before anything is written, so that a\n",
"                # rejected entry cannot leave a half-filled row behind.\n",
"                number = float(value[:6])\n",
"                if not math.isfinite(number):\n",
"                    continue\n",
"                date = key.split(\" \", 1)[1][:-4]\n",
"                # writeDate parses before it writes: a bad date writes nothing.\n",
"                writeDate(worksheet, row, 0, date, dateFormat)\n",
"            except (IndexError, TypeError, ValueError):\n",
"                continue\n",
"\n",
"            worksheet.write(row, 1, value)\n",
"            worksheet.write(row, 2, number)\n",
"            row += 1\n",
"\n",
"\n",
"data2cleanexcel(data)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}