From 2bb6a15180038a97e17147f38839154bc3fc379d Mon Sep 17 00:00:00 2001 From: psoubrie Date: Mon, 15 Apr 2024 17:05:20 +0200 Subject: [PATCH] own ocr --- environment.yml | Bin 13854 -> 0 bytes img2xlsx.ipynb | 201 +++++++++++++++++++++++++++++++++-------------- requirements.txt | 2 - 3 files changed, 140 insertions(+), 63 deletions(-) delete mode 100644 environment.yml delete mode 100644 requirements.txt diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 1bb48ec606bb2978c35fa0fb934016ef14f310fb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13854 zcmche+fF0P6^8pdQr=prOhx~Z7eREB*&G$G2M~q9;;W;UP_Xo zsHWmDZPDIH`Ug>C&nrFgExm?m)rly1-Dv$2Q66QhHYo>s2Fv~o2c+FdUTmB5;T!ZH z>f51y=I=5cb;$H4IUbYM(8bokf~78}nJ!p$pl67&jX2Ep;AxrQyfLVcHP8@k|4+x1BNi4VMb*+JQqqHAsYb*i;z$*W^ozl;fDf2{S%n~ii^ zrTDnlXL$NepFYXTL>VultRGjXC)E!I22myiRiQVEW=%0){KYSLQXA(pWWcCBL< z+k{?+B!bhko}P*WUJh;AnUvQeYMa+4dywHuRf}~$vo;>NN%piR)M{+IR>ZhIMC3Z7 z_SeT*wUc*|2Wt=?k;D7xs5AGXoT@Jj zfnFPN9LT1#>W^gKt^Qi;x0T)pT8k$i-laTFnMD@gNNS?4tIo8!lZoz7Yw=npy~z$N z1_xX0T-;~+AxF~Mc*?QrY7LK@WbvV7N9M9p1`j-7KJM&}I`$yD#Zd^eA zd_)yFzt$7Tdz$hjDqz2ypKBdFc`ILUC8<4)4XEmE>@X^#)6^U3Nu-DM>U_ry;TmFPKX{uT6)8_A1pP z>ffCx$i!nwc%dhG7Bq-nO7sRwRap zlJ8PdI401Bc#4cdlXK|;vYkkmykgh<={C-Y<7Ab{%!U4f76b?~C@pFZpxKB_Bq1M& z%#moJBOkgb$%k#KWPxbvbxRzP*F)BQl+2)5tZ^f&u#?(|BYDif_y=j4NQ>+|m(3=6 zA9c!r2`FB#S({sjCop!0kEJt2VRLOVx z4J^~ zbC#hRfYNUjDa6TYEWk?w!#CN`1QO39-%|-m@p@ zKVRRPJYwSXc`h|Vz%Rsq@c2_Y5+6;ZNgoGYP2@qc`Aq+I`75|Sr>zTduWwJI6{+1Fg%x@IgMHmwGPF&Yk_DC*pNiZT4pH{YWd(LFlWWUF}z`uu%3; z_rPs^^k5djBocH0GU=yB6QfRKEoL=`HTs%|nrksz`n794NfV|9)E4Z>i9~kPiL8aEqbq2?%VrCo&CWc;YmKCqrCK95yWt& zTm@TjuJAJ}($$cquVwpcyG{7ic-zRmjU2k_|Dz&`*&dW&{uAj6>R^|DncnHD-AKzb zN-7}p5%9WE3zH^uIaQ^@cI?PM;HL}%Gi%aH%AvE40cYO zwQ;7zY^4jc`CKZb#|EY)2bd5uQ}_s-yy&|d)fIOD_EMGsE&9v9lZ4eASYx9ZG8jLQiD$}S z5Djw^5N^2!n1#+8=hRHKKr)VFV%XEpe1bv^7nW^~7CzWPe5$cYsZ|FeJpbwCgE)x2GX3Q9m*jjOnoIEIB#Mi(MH0- zkKO6hyFTQ*D-=}Z)i70@vQ`8w^_8cvKl((EDRVxLhMx7pCAP_ z^_?A;t7@!VR;k<%&~9W6rdyl>;+61Zv+A&aU~f%Sf?@M`jGfW=9nbp|4DIO=-SLoH zpoyl4VC?TYUC%k-j#KCGUq20yI?hn2GBrjpjy+Z_ zXiwZZ-&m&V@*)}8jngS=%hkDXnxcR%OP?+6$GsEu{hZbB6zkkxpdW$B8Ue$ub$?3uyfv2^8$RvG2BbRem~bXcfa z9U@D?@t}F8sCcrB&zjq{WaHMFt1Z#YzQDSfTdCKJC_JC1^M|=NISK0~4(DmE*?x2bx}*+L)9UkmupqfvOLSN}PUoo^%OsC!?I$D*jhJn6lZZ)GyGC>K z354jUVvBaqJ2(2@m(-i@bJc=I=m5QPt(EC zmOG+?TdC9+?jvw!SmtT$bvbA7?LeQovUbEc`jnk)qbTxu3Qzg0hFrihM)AJ6(9G=c zZhr5M9>ofWyt`5KPZhDADS!<6SMH56uRfKxhQu^}0p-!H$-Vadf~~#ijae@!`&IISI2n7Xjzj=j+|$CA4nA<(s}i@<+HS6bt z7}yyt*eim$=wMCXLBaFavWD-EQ9V4}FcCp%MouL?Ynf(%6r6^XE{G?h4xOP?)l^_ znbF7EzR`|!winorNbX~?ug=ciQa6uX7&= zYf4j&=JWfscCYz}2S|`Lv9{NBeIuK?ybB_#`#atQCqRRw^;EJcDRDri#{X=fibS8O zYg9?$$qIMh=uY0d87~vr-8N*UHr+rq94yFAD<*%NRMS8@p@XYwx6ZGP9Wfl@-t9Wk zM_uUFg$_c;=Cpx4n!no_D&w85HQyvJwkC6$QSq_vH3b&%Nh`Z`?tY`CF%D~SvSKgp Ll=uIS|E={uvD<*K diff --git a/img2xlsx.ipynb b/img2xlsx.ipynb index e105970..93cb411 100644 --- a/img2xlsx.ipynb +++ b/img2xlsx.ipynb @@ -2,48 +2,36 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 1, + "metadata": { + "metadata": {} + }, "outputs": [], "source": [ - "from paddleocr import PaddleOCR\n", "import os\n", "import cv2\n", "import xlsxwriter\n", + "import numpy as np\n", "\n", "\n", - "def ocr_image(image_path, ocr):\n", - " result = ocr.ocr(image_path, cls=True)\n", - " return result[0][0][1][0]\n", + "def shear_img(image):\n", + " rows, cols, _ = image.shape\n", + " M = np.float32([[1, -0.22, 0], [0, 1, 0], [0, 0, 1]])\n", + " sheared_img = cv2.warpPerspective(image, M, (int(cols * 1.5), int(rows * 1.5)))\n", + " return cv2.flip(sheared_img, 1)\n", "\n", "\n", - "def cropUI(image_path):\n", - " # small opencv window to crop the image\n", - " image = cv2.imread(image_path)\n", - " r = cv2.selectROI(image)\n", - " cv2.destroyAllWindows()\n", - "\n", - " return r\n", - "\n", - "\n", - "def cropImage(image_path, r, flip, cropped_folder):\n", - " # crop the image and save it\n", - " image = cv2.imread(image_path)\n", - " cropped = image[int(r[1]) : int(r[1] + r[3]), int(r[0]) : int(r[0] + r[2])]\n", - "\n", - " # save with new name\n", - " if flip == \"y\":\n", - " cropped = cv2.flip(cropped, 1)\n", - "\n", - " # save in subfolder cropped\n", - " new_name = os.path.join(cropped_folder, image_path.split(\"/\")[-1])\n", - " cv2.imwrite(new_name, cropped)\n" + "def crop_image(sheared_image, r, target_path):\n", + " cropped = sheared_image[int(r[1]) : int(r[1] + r[3]), int(r[0]) : int(r[0] + r[2])]\n", + " return cropped" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 2, + "metadata": { + "metadata": {} + }, "outputs": [], "source": [ "from datetime import datetime\n", @@ -76,52 +64,139 @@ " pass\n", " row += 1\n", "\n", - " workbook.close()\n" + " workbook.close()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 3, + "metadata": { + "metadata": {} + }, "outputs": [], "source": [ - "FOLDER = \"data/\"\n", - "# ocr settings\n", - "os.environ[\"KMP_DUPLICATE_LIB_OK\"] = \"TRUE\"\n", - "ocr = PaddleOCR(use_angle_cls=True, lang=\"en\")\n", + "def peaks(data, boxes=2):\n", + " # split data in boxes\n", + " data = np.array_split(data, boxes)\n", "\n", - "# *** start GUIs ***\n", - "images = [f for f in os.listdir(FOLDER) if f.endswith(\".jpg\")]\n", - "region = cropUI(os.path.join(FOLDER, images[0]))\n", + " # get the maximum value in each box\n", + " data = [np.max(d) for d in data]\n", + " data = [d > 150 for d in data]\n", + " return np.array(data).astype(int).tolist()\n", "\n", - "flip = input(\"Do you want to flip the images horizontaly? (y/n): \")\n", "\n", - "cropped_folder = os.path.join(FOLDER, \"cropped\")\n", - "if not os.path.exists(cropped_folder):\n", - " os.makedirs(cropped_folder)\n", + "digits = {\n", + " 0: [[1, 0, 1], [1, 1], [1, 1]],\n", + " 1: [[0, 0, 0], [0, 1], [0, 1]],\n", + " 2: [[1, 1, 1], [0, 1], [1, 0]],\n", + " 3: [[1, 1, 1], [0, 1], [0, 1]],\n", + " 4: [[0, 1, 0], [1, 1], [0, 1]],\n", + " 5: [[1, 1, 1], [1, 0], [0, 1]],\n", + " 6: [[1, 1, 1], [1, 0], [1, 1]],\n", + " 7: [[1, 0, 0], [0, 1], [0, 1]],\n", + " 8: [[1, 1, 1], [1, 1], [1, 1]],\n", + " 9: [[1, 1, 1], [1, 1], [0, 1]],\n", + "}\n", "\n", - "# *** start cropping ***\n", - "for image in images:\n", - " cropImage(os.path.join(FOLDER, image), region, flip, cropped_folder)\n", "\n", - "# *** start OCR ***\n", - "cropped_images = [f for f in os.listdir(cropped_folder) if f.endswith(\".jpg\")]\n", - "data = {}\n", - "for image in cropped_images:\n", - " try:\n", - " path = os.path.join(cropped_folder, image)\n", - " text = ocr_image(path, ocr)\n", - " except Exception as _:\n", - " print(\"Error in cropped image\")\n", - " continue\n", + "def ownOCR(image):\n", + " # get vertical pixel line in the middle of the image\n", + " vertical = image[:, image.shape[1] // 2, 0]\n", "\n", - " data[image] = text\n" + " # get two horizontal lines at 1/3 and 2/3 of the image\n", + " horizontal1 = image[image.shape[0] // 3, :, 0]\n", + " horizontal2 = image[2 * image.shape[0] // 3, :, 0]\n", + "\n", + " # get times it goes above 150, remove subsequent values\n", + " digit = [peaks(vertical, 3), peaks(horizontal1), peaks(horizontal2)]\n", + " digit = [key for key, value in digits.items() if value == digit]\n", + " return digit[0]" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 4, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n" + ] + } + ], + "source": [ + "folder = \"../../../Downloads/wetransfer_metingen-8-04-tot-15-40-5min_2024-04-15_0905/metingen 8-04 tot 15-40 (5min)/\"\n", + "images = [f for f in os.listdir(folder) if f.endswith(\".jpg\")]\n", + "cropped_folder = os.path.join(folder, \"cropped\")\n", + "os.makedirs(cropped_folder, exist_ok=True)\n", + "\n", + "# Initial image for ROI selection\n", + "init_image_path = os.path.join(folder, images[0])\n", + "init_image = cv2.imread(init_image_path)\n", + "sheared_init_image = shear_img(init_image)\n", + "\n", + "regions = []\n", + "for i in range(5): # Assume 5 regions as in your original code\n", + " r = cv2.selectROI(f\"Select the digit {i+1}\", sheared_init_image)\n", + " cv2.destroyAllWindows()\n", + " regions.append(r)\n", + "\n", + "data = {}\n", + "i = 0\n", + "# Process all images\n", + "for image_name in images:\n", + " try:\n", + " image_path = os.path.join(folder, image_name)\n", + " image = cv2.imread(image_path)\n", + " sheared_image = shear_img(image)\n", + "\n", + " for idx, region in enumerate(regions):\n", + " target_path = os.path.join(cropped_folder, f\"{image_name[:-4]}_{idx+1}.jpg\")\n", + " cropped = crop_image(sheared_image, region, target_path)\n", + " digit = ownOCR(cropped)\n", + "\n", + " if image_name not in data:\n", + " data[image_name] = \"\"\n", + "\n", + " data[image_name] += str(digit)\n", + " except Exception as e:\n", + " print(e)\n", + " continue" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "metadata": {} + }, "outputs": [], "source": [ "data2excel(data)" @@ -130,7 +205,9 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "metadata": {} + }, "outputs": [], "source": [ "# clean excel file\n", @@ -146,9 +223,10 @@ " row = 0\n", " for key, value in data.items():\n", " try:\n", + " value = \"6\"+value[1:]\n", " worksheet.write(row, 2, float(value[:6]))\n", " date = key.split(\" \", 1)[1][:-4]\n", - " writeDate(worksheet, row, 0, date, dateFormat)\n", + " writeDate(worksheet, row, 0, date, dateFormat) \n", " worksheet.write(row, 1, value)\n", " except Exception as _:\n", " continue\n", @@ -157,6 +235,7 @@ "\n", " workbook.close()\n", "\n", + "\n", "data2cleanexcel(data)" ] } diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f6cbdea..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -paddlepaddle==2.6.0 -paddleocr>=2.0.1 \ No newline at end of file