Spaces:
Running
Running
File size: 4,451 Bytes
168e3cd d3333ba 1bd3e46 64aa8c0 168e3cd bdc7119 1bd3e46 bdc7119 64aa8c0 1bd3e46 64aa8c0 1bd3e46 64aa8c0 1bd3e46 64aa8c0 1bd3e46 64aa8c0 1bd3e46 64aa8c0 1bd3e46 64aa8c0 1bd3e46 64aa8c0 1bd3e46 168e3cd bdc7119 168e3cd bdc7119 168e3cd d3333ba 1bd3e46 d3333ba bdc7119 1bd3e46 64aa8c0 1bd3e46 64aa8c0 1bd3e46 168e3cd d3333ba 168e3cd bdc7119 64aa8c0 1bd3e46 64aa8c0 1bd3e46 64aa8c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# prompts.py
# Qwen-2.5 Compatible Prompts
# Strict SOP for "Document Processing SOP for Zoho Invoice Integration"
# Outputs MUST be wrapped between <<<JSON>>> and <<<END_JSON>>> markers.
from typing import Optional
def get_ocr_extraction_prompt(raw_text: str, page_count: int = 1) -> str:
    """
    Build the ChatML extraction prompt for one OCR'd document.

    The prompt instructs the model to:
    - classify the doc
    - extract fields into a fixed JSON schema
    - validate totals and dates
    - output ONLY the JSON between <<<JSON>>> and <<<END_JSON>>>

    Args:
        raw_text: OCR text of the document. Only the first 3000 characters are
            embedded in the prompt, so callers do not need to pre-truncate.
            None is treated as empty text.
        page_count: Page count reported to the model in the MULTI-PAGE section.

    Returns:
        The full prompt string (system turn, user turn, and an opened
        assistant turn for the model to complete).
    """
    # Kept as a raw, non-f string so the JSON braces need no escaping; it is
    # interpolated into the f-string below as a single value.
    schema = r'''
Top-level schema (use null for unknown fields):
{
"document_type": "", "document_id": "", "invoice_date": "", "due_date": "", "currency": "",
"totals": { "sub_total": null, "tax_total": null, "round_off": null, "grand_total": null },
"seller": { "company": null, "address": null, "city": null, "state": null, "zip": null, "country": null, "gstin": null, "pan": null, "bank_details": null },
"buyer": { "contact_name": null, "company_name": null, "billing_address": null, "shipping_address": null, "email": null, "phone": null, "gstin": null, "pan": null },
"line_items": [ { "name": null, "description": null, "hsn_or_sac": null, "sku": null, "quantity": null, "unit": null, "rate": null, "amount": null, "taxes": [ { "type": null, "rate": null, "amount": null, "tax_id": null } ] } ],
"tax_breakdown": [ { "tax_type": null, "cgst": null, "sgst": null, "igst": null, "cess": null } ],
"references": { "reference_invoice_number": null, "po_number": null, "delivery_challan": null },
"payment_terms": null, "notes": null, "qr_codes": [ { "type": null, "value": null } ],
"raw_text_sample": null,
"validation": { "amounts_balanced": null, "missing_critical_fields": [] }
}
'''
    # Guard against a missing OCR result instead of failing on the slice.
    # NOTE(review): the text is embedded verbatim; if it can contain ChatML
    # control tokens such as <|im_end|>, consider sanitizing it upstream.
    text_sample = (raw_text or "")[:3000]
    # The final line must be the exact token sequence "<|im_start|>assistant"
    # so the model continues inside a properly opened assistant turn.
    return f"""<|im_start|>system
You are an invoice & document data extraction assistant. Follow instructions exactly.
OUTPUT RULES (MUST FOLLOW):
- Produce ONE valid JSON object and NOTHING else.
- Wrap JSON between EXACT markers with no extra commentary:
<<<JSON>>>
{{ ... }}
<<<END_JSON>>>
- Use double quotes for all JSON strings. No trailing commas.
- Dates must be ISO YYYY-MM-DD or null. Numeric fields must be numbers or null.
- If unknown, use null or empty list/object as appropriate.
SCHEMA:
{schema}
VALIDATION:
- Normalize and validate dates; if unparseable set null and add to validation.missing_critical_fields.
- Normalize numeric values (remove commas/currency symbols). If conversion fails set null and add to missing_critical_fields.
- Set validation.amounts_balanced = true only if sum(line_items.amount) + totals.tax_total ± totals.round_off equals totals.grand_total (tolerance 0.5).
- Include up to first 3000 chars of raw text in raw_text_sample.
MULTI-PAGE:
- page_count = {page_count}. Merge line_items across pages.
Do NOT call external APIs. Output only the JSON between the markers.
<|im_end|>
<|im_start|>user
Input Text (first 3000 chars):
{text_sample}
<|im_end|>
<|im_start|>assistant
"""
def get_agent_prompt(history_text: str, user_message: str) -> str:
    """
    Build the ChatML orchestrator prompt.

    The system turn tells the model that, when asked to persist, it must
    output EXACT tool-call JSON:
    { "tool": "<tool_name>", "args": { ... } }
    and otherwise produce a human-friendly summary (no tool JSON).

    Args:
        history_text: Prior conversation, inserted verbatim under "HISTORY:".
        user_message: The current request, inserted verbatim under
            "CURRENT REQUEST:".

    Returns:
        The full prompt string (system turn, user turn, and an opened
        assistant turn for the model to complete).
    """
    # The final line must be the exact token sequence "<|im_start|>assistant"
    # so the model continues inside a properly opened assistant turn.
    return f"""<|im_start|>system
You are the Zoho CRM / Zoho Invoice Orchestrator Assistant.
TOOLS (only call when user explicitly requests persist/save/create/push/upload):
- create_contact(contact_json)
- create_item(item_json)
- create_invoice(invoice_json)
- create_creditnote(creditnote_json)
MANDATES:
- If calling a tool, output ONLY a single JSON object:
{{ "tool": "<tool_name>", "args": {{ ... }} }}
and nothing else.
- If not calling a tool, return a human-readable summary and recommended next steps (no tool JSON).
- If validation.amounts_balanced is false or critical fields missing, DO NOT call tools; ask for manual review.
<|im_end|>
<|im_start|>user
HISTORY:
{history_text}
CURRENT REQUEST:
{user_message}
<|im_end|>
<|im_start|>assistant
"""
# small helper prompt used by app when validating parsed JSON quickly
def get_quick_extraction_check_prompt(summary: str) -> str:
    """
    Build the short validator prompt used to sanity-check parsed JSON.

    Args:
        summary: The JSON text to check, inserted verbatim after "Input:".

    Returns:
        A prompt asking for a JSON verdict with missing_fields,
        parse_warnings and ok.
    """
    instruction = (
        "You are a JSON validator. Check the JSON below for required fields: "
        "document_id, invoice_date, totals.grand_total, buyer.contact_name."
    )
    reply_format = (
        'Return only a JSON: { "missing_fields": [...], '
        '"parse_warnings": [...], "ok": true|false }'
    )
    # The trailing empty element reproduces the newline that ends the prompt.
    pieces = (instruction, reply_format, "Input:", f"{summary}", "")
    return "\n".join(pieces)
|