Spaces:
Running
Running
import os

import pandas as pd

# Credentials for the Google Cloud Vision client created in detect_text().
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./audit-edge-cloud-products-6eb51d237f19.json"
def run_tesseract_on_image(image_path):
    """Run Tesseract OCR on an image and return the path of the TSV output.

    Args:
        image_path: Path to a PNG/JPG/JPEG image file.

    Returns:
        Path to the generated TSV file under /content.

    Raises:
        ValueError: If the tesseract command fails or is not installed.
    """
    import subprocess  # local import, matching this file's style (see detect_text)

    print("image_path", image_path)
    # splitext keeps dots inside the base name and handles dot-less names;
    # the original `name[:name.find('.')]` truncated at the FIRST dot and,
    # when no dot was present (find() == -1), silently chopped the last char.
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    try:
        # Argument list + no shell: safe for paths with spaces/quotes,
        # unlike the original interpolated os.system() shell string.
        error_code = subprocess.run(
            ["tesseract", image_path, f"/content/{image_name}", "-l", "eng", "tsv"]
        ).returncode
    except FileNotFoundError:
        # tesseract binary missing: preserve the original ValueError contract.
        error_code = 1
    if not error_code:
        return f"/content/{image_name}.tsv"
    else:
        raise ValueError('Tesseract OCR Error please verify image format PNG,JPG,JPEG')
def clean_tesseract_output(tsv_output_path):
    """Parse a Tesseract TSV file into a list of word/bounding-box dicts.

    Args:
        tsv_output_path: Path to a Tesseract `tsv`-format output file.

    Returns:
        List of dicts, one per non-empty word, each with:
            'word_text': the recognized string.
            'word_box':  [left, top, right, bottom] in image pixels.
    """
    print("tsv_output_path", tsv_output_path)
    ocr_df = pd.read_csv(tsv_output_path, sep='\t')
    # Drop rows with missing fields, then rows whose text is only whitespace.
    ocr_df = ocr_df.dropna()
    ocr_df = ocr_df.drop(ocr_df[ocr_df.text.str.strip() == ''].index)
    # Convert Tesseract's (left, top, width, height) to absolute corners.
    # (The original also built an unused `text_output` string; removed.)
    return [
        {
            "word_text": row["text"],
            "word_box": [row["left"], row["top"],
                         row["left"] + row["width"],
                         row["top"] + row["height"]],
        }
        for _, row in ocr_df.iterrows()
    ]
def detect_text(path):
    """Detect words in an image via the Google Cloud Vision text-detection API.

    Args:
        path: Path to an image file, read in binary mode.

    Returns:
        List of dicts, one per detected word, each with:
            'word_text': the detected string.
            'word_box':  [x1, y1, x2, y2] — top-left and bottom-right corners.

    Raises:
        Exception: If the Vision API response carries an error message.
    """
    print("this is path:", path)
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()
    with open(path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # Fail fast on an API error; the original only raised AFTER iterating
    # the (likely empty) annotations, wasting work and confusing the logs.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    texts = response.text_annotations
    print("Texts:")
    list_of_dict = []
    # texts[0] is the whole-page annotation; individual words start at index 1.
    for text in texts[1:]:
        print(f'\n"{text.description}"')
        vertices_list = [[int(vertex.x), int(vertex.y)]
                         for vertex in text.bounding_poly.vertices]
        print("vertices_list", vertices_list)
        # Vertex order from the API is not guaranteed: take the corner with
        # the smallest x+y as top-left and the largest as bottom-right.
        ordered = sorted(vertices_list, key=lambda coord: coord[0] + coord[1])
        top_left = ordered[0]
        bottom_right = ordered[-1]
        list_of_dict.append({
            "word_text": text.description,
            "word_box": [top_left[0], top_left[1],
                         bottom_right[0], bottom_right[1]],
        })
    return list_of_dict
def prepare_batch_for_inference(image_paths):
    """Build an inference batch from a list of image paths.

    Each image is OCR'd with Google Cloud Vision (detect_text); the detected
    words and their bounding boxes are grouped per image. (Earlier revisions
    used the Tesseract pipeline; that dead, commented-out code was removed.)

    Args:
        image_paths: List of image file paths.

    Returns:
        Dict with keys:
            'image_path': the input list, unchanged.
            'bboxes': per-image lists of [x1, y1, x2, y2] word boxes.
            'words':  per-image lists of word strings, parallel to 'bboxes'.
    """
    clean_outputs = [detect_text(image_path) for image_path in image_paths]
    print("clean_outputs", clean_outputs)
    word_lists = [[word['word_text'] for word in clean_output]
                  for clean_output in clean_outputs]
    boxes_lists = [[word['word_box'] for word in clean_output]
                   for clean_output in clean_outputs]
    return {
        "image_path": image_paths,
        "bboxes": boxes_lists,
        "words": word_lists
    }