Spaces:
Running
Running
import os

import pandas as pd

# Credentials for the Google Cloud Vision client created in detect_text().
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./audit-edge-cloud-products-6eb51d237f19.json"
def run_tesseract_on_image(image_path):
    """Run Tesseract OCR on an image and return the path of the TSV output.

    Args:
        image_path: Path to a PNG/JPG/JPEG image file.

    Returns:
        Path to the generated TSV file under /content.

    Raises:
        ValueError: If the tesseract command fails or is not installed.
    """
    import subprocess  # local import, matching this file's style (see detect_text)

    print("image_path", image_path)
    # splitext keeps dots inside the base name and handles dot-less names;
    # the original `name[:name.find('.')]` truncated at the FIRST dot and,
    # when no dot was present (find() == -1), silently chopped the last char.
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    try:
        # Argument list + no shell: safe for paths with spaces/quotes,
        # unlike the original interpolated os.system() shell string.
        error_code = subprocess.run(
            ["tesseract", image_path, f"/content/{image_name}", "-l", "eng", "tsv"]
        ).returncode
    except FileNotFoundError:
        # tesseract binary missing: preserve the original ValueError contract.
        error_code = 1
    if not error_code:
        return f"/content/{image_name}.tsv"
    else:
        raise ValueError('Tesseract OCR Error please verify image format PNG,JPG,JPEG')
def clean_tesseract_output(tsv_output_path):
    """Parse a Tesseract TSV file into a list of word/bounding-box dicts.

    Args:
        tsv_output_path: Path to a Tesseract `tsv`-format output file.

    Returns:
        List of dicts, one per non-empty word, each with:
            'word_text': the recognized string.
            'word_box':  [left, top, right, bottom] in image pixels.
    """
    print("tsv_output_path", tsv_output_path)
    ocr_df = pd.read_csv(tsv_output_path, sep='\t')
    # Drop rows with missing fields, then rows whose text is only whitespace.
    ocr_df = ocr_df.dropna()
    ocr_df = ocr_df.drop(ocr_df[ocr_df.text.str.strip() == ''].index)
    # Convert Tesseract's (left, top, width, height) to absolute corners.
    # (The original also built an unused `text_output` string; removed.)
    return [
        {
            "word_text": row["text"],
            "word_box": [row["left"], row["top"],
                         row["left"] + row["width"],
                         row["top"] + row["height"]],
        }
        for _, row in ocr_df.iterrows()
    ]
def detect_text(path):
    """Detect words in an image via the Google Cloud Vision text-detection API.

    Args:
        path: Path to an image file, read in binary mode.

    Returns:
        List of dicts, one per detected word, each with:
            'word_text': the detected string.
            'word_box':  [x1, y1, x2, y2] — top-left and bottom-right corners.

    Raises:
        Exception: If the Vision API response carries an error message.
    """
    print("this is path:", path)
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()
    with open(path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # Fail fast on an API error; the original only raised AFTER iterating
    # the (likely empty) annotations, wasting work and confusing the logs.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    texts = response.text_annotations
    print("Texts:")
    list_of_dict = []
    # texts[0] is the whole-page annotation; individual words start at index 1.
    for text in texts[1:]:
        print(f'\n"{text.description}"')
        vertices_list = [[int(vertex.x), int(vertex.y)]
                         for vertex in text.bounding_poly.vertices]
        print("vertices_list", vertices_list)
        # Vertex order from the API is not guaranteed: take the corner with
        # the smallest x+y as top-left and the largest as bottom-right.
        ordered = sorted(vertices_list, key=lambda coord: coord[0] + coord[1])
        top_left = ordered[0]
        bottom_right = ordered[-1]
        list_of_dict.append({
            "word_text": text.description,
            "word_box": [top_left[0], top_left[1],
                         bottom_right[0], bottom_right[1]],
        })
    return list_of_dict
def prepare_batch_for_inference(image_paths):
    """Build an inference batch from a list of image paths.

    Each image is OCR'd with Google Cloud Vision (detect_text); the detected
    words and their bounding boxes are grouped per image. (Earlier revisions
    used the Tesseract pipeline; that dead, commented-out code was removed.)

    Args:
        image_paths: List of image file paths.

    Returns:
        Dict with keys:
            'image_path': the input list, unchanged.
            'bboxes': per-image lists of [x1, y1, x2, y2] word boxes.
            'words':  per-image lists of word strings, parallel to 'bboxes'.
    """
    clean_outputs = [detect_text(image_path) for image_path in image_paths]
    print("clean_outputs", clean_outputs)
    word_lists = [[word['word_text'] for word in clean_output]
                  for clean_output in clean_outputs]
    boxes_lists = [[word['word_box'] for word in clean_output]
                   for clean_output in clean_outputs]
    return {
        "image_path": image_paths,
        "bboxes": boxes_lists,
        "words": word_lists
    }