# prompts.py
# Qwen-2.5 Compatible Prompts
# Strict SOP for "Document Processing SOP for Zoho Invoice Integration"
# Outputs MUST be wrapped between <<>> and <<>> markers.
# NOTE(review): the start and end markers are textually identical here; they
# may have lost their labels at some point -- confirm against the code that
# parses the model output before changing them.

from typing import Optional


def get_ocr_extraction_prompt(raw_text: str, page_count: int = 1) -> str:
    """
    Build the strict extraction prompt for one document.

    The prompt instructs the model to:
    - classify the doc
    - extract fields into a fixed JSON schema
    - validate totals and dates
    - output ONLY the JSON between <<>> and <<>>

    Args:
        raw_text: OCR text of the document. Only the first 3000 characters
            are embedded in the prompt, even if the caller already truncated.
        page_count: Number of pages, interpolated into the MULTI-PAGE section.

    Returns:
        A ChatML-formatted prompt (system + user turns) that ends with the
        opening tag of the assistant turn, ready for completion.
    """
    # Raw string so the JSON braces need no escaping; it is interpolated as a
    # value into the f-string below, so its braces are not re-interpreted.
    schema = r'''
Top-level schema (use null for unknown fields):
{
  "document_type": "",
  "document_id": "",
  "invoice_date": "",
  "due_date": "",
  "currency": "",
  "totals": { "sub_total": null, "tax_total": null, "round_off": null, "grand_total": null },
  "seller": { "company": null, "address": null, "city": null, "state": null, "zip": null, "country": null, "gstin": null, "pan": null, "bank_details": null },
  "buyer": { "contact_name": null, "company_name": null, "billing_address": null, "shipping_address": null, "email": null, "phone": null, "gstin": null, "pan": null },
  "line_items": [
    {
      "name": null, "description": null, "hsn_or_sac": null, "sku": null,
      "quantity": null, "unit": null, "rate": null, "amount": null,
      "taxes": [ { "type": null, "rate": null, "amount": null, "tax_id": null } ]
    }
  ],
  "tax_breakdown": [ { "tax_type": null, "cgst": null, "sgst": null, "igst": null, "cess": null } ],
  "references": { "reference_invoice_number": null, "po_number": null, "delivery_challan": null },
  "payment_terms": null,
  "notes": null,
  "qr_codes": [ { "type": null, "value": null } ],
  "raw_text_sample": null,
  "validation": { "amounts_balanced": null, "missing_critical_fields": [] }
}
'''

    # The final line opens the assistant turn; it must be the well-formed
    # ChatML tag "<|im_start|>assistant" (same shape as the system/user tags).
    return f"""<|im_start|>system
You are an invoice & document data extraction assistant. Follow instructions exactly.

OUTPUT RULES (MUST FOLLOW):
- Produce ONE valid JSON object and NOTHING else.
- Wrap JSON between EXACT markers with no extra commentary:
  <<>>
  {{ ... }}
  <<>>
- Use double quotes for all JSON strings. No trailing commas.
- Dates must be ISO YYYY-MM-DD or null. Numeric fields must be numbers or null.
- If unknown, use null or empty list/object as appropriate.

SCHEMA:
{schema}

VALIDATION:
- Normalize and validate dates; if unparseable set null and add to validation.missing_critical_fields.
- Normalize numeric values (remove commas/currency symbols). If conversion fails set null and add to missing_critical_fields.
- Set validation.amounts_balanced = true only if sum(line_items.amount) + totals.tax_total ± totals.round_off equals totals.grand_total (tolerance 0.5).
- Include up to first 3000 chars of raw text in raw_text_sample.

MULTI-PAGE:
- page_count = {page_count}. Merge line_items across pages.

Do NOT call external APIs. Output only the JSON between the markers.
<|im_end|>
<|im_start|>user
Input Text (first 3000 chars):
{raw_text[:3000]}
<|im_end|>
<|im_start|>assistant
"""


def get_agent_prompt(history_text: str, user_message: str) -> str:
    """
    Build the orchestrator prompt.

    When asked to persist, the model must output EXACT tool-call JSON:
      { "tool": "", "args": { ... } }
    Otherwise it must produce a human-friendly summary (no tool JSON).

    Args:
        history_text: Prior conversation, inserted under "HISTORY:".
        user_message: The current request, inserted under "CURRENT REQUEST:".

    Returns:
        A ChatML-formatted prompt (system + user turns) that ends with the
        opening tag of the assistant turn.
    """
    # NOTE(review): the "tool" value in the example JSON is an empty string;
    # it may originally have carried a placeholder name -- confirm.
    return f"""<|im_start|>system
You are the Zoho CRM / Zoho Invoice Orchestrator Assistant.

TOOLS (only call when user explicitly requests persist/save/create/push/upload):
- create_contact(contact_json)
- create_item(item_json)
- create_invoice(invoice_json)
- create_creditnote(creditnote_json)

MANDATES:
- If calling a tool, output ONLY a single JSON object: {{ "tool": "", "args": {{ ... }} }} and nothing else.
- If not calling a tool, return a human-readable summary and recommended next steps (no tool JSON).
- If validation.amounts_balanced is false or critical fields missing, DO NOT call tools; ask for manual review.
<|im_end|>
<|im_start|>user
HISTORY:
{history_text}

CURRENT REQUEST:
{user_message}
<|im_end|>
<|im_start|>assistant
"""


# small helper prompt used by app when validating parsed JSON quickly
def get_quick_extraction_check_prompt(summary: str) -> str:
    """
    Build a short prompt asking the model to check parsed JSON for required fields.

    Args:
        summary: The JSON text to check, inserted verbatim under "Input:".

    Returns:
        A plain (non-ChatML) prompt string requesting a JSON verdict with
        "missing_fields", "parse_warnings" and "ok".
    """
    return f"""You are a JSON validator. Check the JSON below for required fields:
document_id, invoice_date, totals.grand_total, buyer.contact_name.
Return only a JSON: {{ "missing_fields": [...], "parse_warnings": [...], "ok": true|false }}

Input:
{summary}
"""