Bapt120 committed on
Commit
3654ed1
·
verified ·
1 Parent(s): 6807791

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -27
app.py CHANGED
@@ -1,13 +1,11 @@
1
  #!/usr/bin/env python3
2
  import subprocess
3
  import sys
4
-
5
 
6
  import spaces
7
  import torch
8
 
9
-
10
-
11
  import gradio as gr
12
  from PIL import Image
13
  from io import BytesIO
@@ -15,6 +13,7 @@ import pypdfium2 as pdfium
15
  from transformers import (
16
  LightOnOCRForConditionalGeneration,
17
  LightOnOCRProcessor,
 
18
  )
19
 
20
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -68,8 +67,35 @@ def process_pdf(pdf_path, page_num=1):
68
  return img, total_pages, page_idx + 1
69
 
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  @spaces.GPU
72
- def extract_text_from_image(image, temperature=0.2):
73
  """Extract text from image using LightOnOCR model."""
74
  # Prepare the chat format
75
  chat = [
@@ -98,26 +124,55 @@ def extract_text_from_image(image, temperature=0.2):
98
  for k, v in inputs.items()
99
  }
100
 
101
- # Generate text with appropriate settings
102
- with torch.no_grad(): # Disable gradients for inference
103
- outputs = model.generate(
104
- **inputs,
105
- max_new_tokens=2048,
106
- temperature=temperature if temperature > 0 else 0.0,
107
- use_cache=True,
108
- do_sample=temperature > 0,
109
- )
110
-
111
- # Decode the output
112
- output_text = processor.decode(outputs[0], skip_special_tokens=True)
113
 
114
- return output_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
 
117
  def process_input(file_input, temperature, page_num):
118
- """Process uploaded file (image or PDF) and extract text."""
119
  if file_input is None:
120
- return "Please upload an image or PDF first.", "", "", None, gr.update()
 
121
 
122
  image_to_process = None
123
  page_info = ""
@@ -130,24 +185,25 @@ def process_input(file_input, temperature, page_num):
130
  image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
131
  page_info = f"Processing page {actual_page} of {total_pages}"
132
  except Exception as e:
133
- return f"Error processing PDF: {str(e)}", "", "", None, gr.update()
 
134
  # Handle image files
135
  else:
136
  try:
137
  image_to_process = Image.open(file_path)
138
  page_info = "Processing image"
139
  except Exception as e:
140
- return f"Error opening image: {str(e)}", "", "", None, gr.update()
 
141
 
142
  try:
143
- # Extract text using LightOnOCR
144
- extracted_text = extract_text_from_image(image_to_process, temperature)
145
-
146
- return extracted_text, extracted_text, page_info, image_to_process, gr.update()
147
 
148
  except Exception as e:
149
  error_msg = f"Error during text extraction: {str(e)}"
150
- return error_msg, error_msg, page_info, image_to_process, gr.update()
151
 
152
 
153
  def update_slider(file_input):
@@ -178,7 +234,7 @@ with gr.Blocks(title="📖 Image/PDF OCR with LightOnOCR", theme=gr.themes.Soft(
178
  1. Upload an image or PDF
179
  2. For PDFs: select which page to extract (1-20)
180
  3. Adjust temperature if needed (0.0 for deterministic, higher for more varied output)
181
- 4. Click "Extract Text"
182
 
183
  **Note:** The Markdown rendering for tables may not always be perfect. Check the raw output for complex tables!
184
 
 
1
  #!/usr/bin/env python3
2
  import subprocess
3
  import sys
4
+ import threading
5
 
6
  import spaces
7
  import torch
8
 
 
 
9
  import gradio as gr
10
  from PIL import Image
11
  from io import BytesIO
 
13
  from transformers import (
14
  LightOnOCRForConditionalGeneration,
15
  LightOnOCRProcessor,
16
+ TextIteratorStreamer,
17
  )
18
 
19
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
67
  return img, total_pages, page_idx + 1
68
 
69
 
70
+ def clean_output_text(text):
71
+ """Remove chat template artifacts from output."""
72
+ # Remove common chat template markers
73
+ markers_to_remove = ["system", "user", "assistant"]
74
+
75
+ # Split by lines and filter
76
+ lines = text.split('\n')
77
+ cleaned_lines = []
78
+
79
+ for line in lines:
80
+ stripped = line.strip()
81
+ # Skip lines that are just template markers
82
+ if stripped.lower() not in markers_to_remove:
83
+ cleaned_lines.append(line)
84
+
85
+ # Join back and strip leading/trailing whitespace
86
+ cleaned = '\n'.join(cleaned_lines).strip()
87
+
88
+ # Alternative approach: if there's an "assistant" marker, take everything after it
89
+ if "assistant" in text.lower():
90
+ parts = text.split("assistant", 1)
91
+ if len(parts) > 1:
92
+ cleaned = parts[1].strip()
93
+
94
+ return cleaned
95
+
96
+
97
  @spaces.GPU
98
+ def extract_text_from_image(image, temperature=0.2, stream=False):
99
  """Extract text from image using LightOnOCR model."""
100
  # Prepare the chat format
101
  chat = [
 
124
  for k, v in inputs.items()
125
  }
126
 
127
+ generation_kwargs = dict(
128
+ **inputs,
129
+ max_new_tokens=2048,
130
+ temperature=temperature if temperature > 0 else 0.0,
131
+ use_cache=True,
132
+ do_sample=temperature > 0,
133
+ )
 
 
 
 
 
134
 
135
+ if stream:
136
+ # Setup streamer for streaming generation
137
+ streamer = TextIteratorStreamer(
138
+ processor.tokenizer,
139
+ skip_prompt=True,
140
+ skip_special_tokens=True
141
+ )
142
+ generation_kwargs["streamer"] = streamer
143
+
144
+ # Run generation in a separate thread
145
+ thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
146
+ thread.start()
147
+
148
+ # Yield chunks as they arrive
149
+ full_text = ""
150
+ for new_text in streamer:
151
+ full_text += new_text
152
+ # Clean the accumulated text
153
+ cleaned_text = clean_output_text(full_text)
154
+ yield cleaned_text
155
+
156
+ thread.join()
157
+ else:
158
+ # Non-streaming generation
159
+ with torch.no_grad():
160
+ outputs = model.generate(**generation_kwargs)
161
+
162
+ # Decode the output
163
+ output_text = processor.decode(outputs[0], skip_special_tokens=True)
164
+
165
+ # Clean the output
166
+ cleaned_text = clean_output_text(output_text)
167
+
168
+ yield cleaned_text
169
 
170
 
171
  def process_input(file_input, temperature, page_num):
172
+ """Process uploaded file (image or PDF) and extract text with streaming."""
173
  if file_input is None:
174
+ yield "Please upload an image or PDF first.", "", "", None, gr.update()
175
+ return
176
 
177
  image_to_process = None
178
  page_info = ""
 
185
  image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
186
  page_info = f"Processing page {actual_page} of {total_pages}"
187
  except Exception as e:
188
+ yield f"Error processing PDF: {str(e)}", "", "", None, gr.update()
189
+ return
190
  # Handle image files
191
  else:
192
  try:
193
  image_to_process = Image.open(file_path)
194
  page_info = "Processing image"
195
  except Exception as e:
196
+ yield f"Error opening image: {str(e)}", "", "", None, gr.update()
197
+ return
198
 
199
  try:
200
+ # Extract text using LightOnOCR with streaming
201
+ for extracted_text in extract_text_from_image(image_to_process, temperature, stream=True):
202
+ yield extracted_text, extracted_text, page_info, image_to_process, gr.update()
 
203
 
204
  except Exception as e:
205
  error_msg = f"Error during text extraction: {str(e)}"
206
+ yield error_msg, error_msg, page_info, image_to_process, gr.update()
207
 
208
 
209
  def update_slider(file_input):
 
234
  1. Upload an image or PDF
235
  2. For PDFs: select which page to extract (1-20)
236
  3. Adjust temperature if needed (0.0 for deterministic, higher for more varied output)
237
+ 4. Click "Extract Text" (now with streaming! ✨)
238
 
239
  **Note:** The Markdown rendering for tables may not always be perfect. Check the raw output for complex tables!
240