import re
import sys
import pdfplumber
import docx
import json
import logging
import os
import pytesseract
from pdf2image import convert_from_path

# === Setup paths for OCR/pdf2image ===
# Runs at import time: appends the Poppler binary directory to the process PATH
# (the header above ties this to pdf2image).
# NOTE(review): the ';' separator and both install locations below are hard-coded
# Windows values; other platforms would need os.pathsep and different paths —
# confirm the intended target OS.
os.environ["PATH"] += r";C:\Program Files\poppler-24.08.0\Library\bin"
# Point pytesseract at an explicit Tesseract executable path.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Console logging
# Records at INFO and above are formatted as "<time> [<LEVEL>] <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-wide logger used by the functions below.
logger = logging.getLogger(__name__)

# ---------------- Helper functions ----------------

def extract_text_from_docx(file_path):
    """Return the text of a DOCX file's paragraphs, one paragraph per line.

    Args:
        file_path: Path of the DOCX document to open.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    logger.info("Extracting text from DOCX")
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)

def extract_text_from_pdf(file_path):
    """Return the embedded text of every page of a PDF.

    Args:
        file_path: Path of the PDF to open with pdfplumber.

    Returns:
        The text of all pages separated by newlines; a page for which
        ``extract_text()`` returns nothing contributes an empty string.
        If anything raises while reading, the exception is logged and a JSON
        string of the form ``{"error": "<message>"}`` is returned instead of
        raising, so callers always receive a ``str``.
    """
    logger.info("Extracting text from PDF")
    try:
        with pdfplumber.open(file_path) as pdf:
            # Separate pages with a newline: joining with '' fused the last word
            # of one page with the first word of the next, which breaks the
            # whitespace-sensitive patterns applied to this text later.
            return '\n'.join(page.extract_text() or '' for page in pdf.pages)
    except Exception as e:
        logger.exception("Error reading PDF")
        return json.dumps({"error": str(e)})

def extract_text_from_scanned_pdf(file_path):
    """OCR every page image of a PDF and return the concatenated text.

    Args:
        file_path: Path of the PDF handed to ``convert_from_path``.

    Returns:
        The ``pytesseract.image_to_string`` output of each converted image,
        concatenated in page order with no extra separator.
    """
    logger.info("Extracting text from scanned PDF using OCR")
    page_images = convert_from_path(file_path)
    ocr_chunks = [
        pytesseract.image_to_string(page_image, config='--oem 3 --psm 6')
        for page_image in page_images
    ]
    return ''.join(ocr_chunks)

def is_scanned_pdf(file_path):
    """Return True when no page of the PDF yields any extractable text.

    Pages are checked in order and the scan stops at the first page whose
    ``extract_text()`` result is truthy.
    """
    with pdfplumber.open(file_path) as pdf:
        has_text_layer = any(page.extract_text() for page in pdf.pages)
    return not has_text_layer

def _first_match(patterns, haystack, flags=0):
    """Return the first ``re.search`` hit over *patterns* (tried in order), or None."""
    for pattern in patterns:
        found = re.search(pattern, haystack, flags)
        if found:
            return found
    return None


def _original_case(text, text_lower, match, group=1):
    """Return *group* of a match made on *text_lower*, sliced from *text* to keep its casing."""
    if len(text) != len(text_lower):
        # str.lower() can change the length of a few non-ASCII characters, which
        # would shift every offset; fall back to the lowercase capture rather
        # than slice the wrong span.
        return match.group(group)
    start, end = match.span(group)
    return text[start:end]


def extract_contract_info(text):
    """Extract employment-contract fields from free text using regex heuristics.

    The text is whitespace-normalised, then each field is looked up with an
    ordered list of patterns; the first pattern that matches wins.

    Args:
        text: Raw contract text. ``None`` or an empty string is accepted and
            yields the all-defaults result.

    Returns:
        A dict with these keys, in this order: ``employee_name``,
        ``remuneration``, ``transport_allowance``, ``position``,
        ``start_date``, ``end_date``, ``contract_date``, ``notice_period``,
        ``probation_period``, ``authority``, ``signature``, ``contract_type``,
        ``address``. All values are strings. Missing fields are
        ``'Not found'``, except ``probation_period`` (defaults to ``'3'``),
        ``signature`` (``'Not Signed'``) and ``contract_type`` (always
        ``'fixed-term'`` or ``'indefinite'``).
    """
    logger.info("Starting contract information extraction...")

    # Keep the untouched text: the line-based signature fallback needs real newlines.
    raw_text = text or ''

    # Collapse every whitespace run (including newlines) into a single space.
    text = re.sub(r'\s+', ' ', raw_text).strip()

    # Lowercase copy for matching; original-case values are sliced back out of `text`.
    text_lower = text.lower()

    contract_info = {}

    # 1. Extract Employee Name
    name_patterns = [
        r'between talent fusion solutions.*?and\s+([a-z][a-z\s]+?)\s*\(herein',
        r'and\s+([a-z][a-z\s]+?)\s*\(herein referred to as the employee\)',
        r'employee\) and\s+([a-z][a-z\s]+?)\s*whereas',
    ]
    match = _first_match(name_patterns, text_lower, re.IGNORECASE)
    if match:
        # Slice the matched span itself; searching for the captured string again
        # could land on an earlier, unrelated occurrence.
        contract_info['employee_name'] = _original_case(text, text_lower, match).strip().title()
        logger.info("Found employee name: %s", contract_info['employee_name'])
    else:
        contract_info['employee_name'] = 'Not found'
        logger.warning("Employee name not found")

    # 2. Extract Remuneration: three progressively looser strategies.
    usd_patterns = [
        r'USD\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
        r'USD\s*\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
        r'\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)\s*\([^)]*dollars[^)]*\)',
    ]
    salary_patterns = [
        r'monthly salary of[^\d]*(\d+(?:,\d{3})*(?:\.\d{2})?)',
        r'salary of[^\d]*(\d+(?:,\d{3})*(?:\.\d{2})?)',
        r'paid.*?salary.*?of[^\d]*(\d+(?:,\d{3})*(?:\.\d{2})?)',
    ]
    remuneration = None

    # Method 1: an explicit USD / dollar amount (first one in the text).
    match = _first_match(usd_patterns, text, re.IGNORECASE)
    if match:
        remuneration = match.group(1)
        logger.info("Found remuneration with USD pattern: %s", remuneration)

    # Method 2: a number following a "salary of" phrase.
    if remuneration is None:
        match = _first_match(salary_patterns, text_lower)
        if match:
            remuneration = match.group(1)
            logger.info("Found remuneration with salary pattern: %s", remuneration)

    # Method 3: the first bare $ amount above 100, assumed to be a salary.
    if remuneration is None:
        for amount in re.findall(r'\$(\d+(?:,\d{3})*(?:\.\d{2})?)', text):
            try:
                amount_num = float(amount.replace(',', ''))
            except ValueError:
                continue
            if amount_num > 100:
                remuneration = amount
                logger.info("Found remuneration from generic $ amount: %s", remuneration)
                break

    if remuneration is None:
        remuneration = 'Not found'
        logger.warning("Remuneration not found")
    contract_info['remuneration'] = remuneration

    # 3. Extract Transport Allowance
    transport_patterns = [
        r'transport allowance.*?USD\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
        r'transport allowance.*?\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
        r'allowance of.*?USD\$\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
    ]
    transport_allowance = None

    # IGNORECASE is required: the patterns spell "USD" in upper case but are
    # applied to the lowercased text, so without it they could never match.
    match = _first_match(transport_patterns, text_lower, re.IGNORECASE)
    if match:
        transport_allowance = match.group(1)
        logger.info("Found transport allowance: %s", transport_allowance)
    else:
        # Fallback: the first $ amount within 100 characters after "transport".
        # The '$' must be part of this pattern; a window made only of non-'$'
        # characters can never contain the amount.
        match = re.search(r'transport[^$]{0,100}\$(\d+(?:,\d{3})*(?:\.\d{2})?)', text_lower)
        if match:
            transport_allowance = match.group(1)
            logger.info("Found transport allowance in context: %s", transport_allowance)

    if transport_allowance is None:
        transport_allowance = 'Not found'
        logger.warning("Transport allowance not found")
    contract_info['transport_allowance'] = transport_allowance

    # 4. Extract Position
    position_patterns = [
        r'position\s*as\s+(?:an?\s+)?([^,\.]{3,50}?)(?:\s*(?:it\s+is|\.|,|$))',
        r'as\s+(?:an?\s+)?([^,\.]{3,50}?)\s*it\s+is\s+the\s+duty',
        r'3\.\s*position[^a-z]*([^,\.]{3,50}?)(?:\s|\.|,)',
    ]
    match = _first_match(position_patterns, text_lower)
    if match:
        contract_info['position'] = _original_case(text, text_lower, match).strip()
        logger.info("Found position: %s", contract_info['position'])
    else:
        contract_info['position'] = 'Not found'
        logger.warning("Position not found")

    # 5. Extract Dates
    # Start date
    start_patterns = [
        r'commenced on the\s+(\d{1,2}(?:st|nd|rd|th)?\s+of\s+\w+\s+\d{4})',
        r'commenced on\s+the\s+(\d{1,2}(?:st|nd|rd|th)?\s+of\s+\w+\s+\d{4})',
        r'deemed to have commenced on the\s+(\d{1,2}(?:st|nd|rd|th)?\s+of\s+\w+\s+\d{4})',
    ]
    match = _first_match(start_patterns, text, re.IGNORECASE)
    if match:
        contract_info['start_date'] = match.group(1)
        logger.info("Found start date: %s", contract_info['start_date'])
    else:
        contract_info['start_date'] = 'Not found'
        logger.warning("Start date not found")

    # End date
    end_patterns = [
        r'expire on the\s+(\d{1,2}(?:st|nd|rd|th)?\s+of\s+\w+\s+\d{4})',
        r'expire\s+on\s+the\s+(\d{1,2}(?:st|nd|rd|th)?\s+of\s+\w+\s+\d{4})',
        r'automatically expire on the\s+(\d{1,2}(?:st|nd|rd|th)?\s+of\s+\w+\s+\d{4})',
    ]
    match = _first_match(end_patterns, text, re.IGNORECASE)
    if match:
        contract_info['end_date'] = match.group(1)
        logger.info("Found end date: %s", contract_info['end_date'])
    else:
        contract_info['end_date'] = 'Not found'
        logger.warning("End date not found")

    # Contract date
    contract_date_patterns = [
        r'dated on the\s+(\d{1,2}(?:st|nd|rd|th)?\s+day of\s+\w+\s+in the year\s+\d{4})',
        r'dated on the\s+(\d{1,2}(?:st|nd|rd|th)?\s+\w+\s+\d{4})',
        r'dated\s+(\d{1,2}(?:st|nd|rd|th)?\s+\w+\s+\d{4})',
    ]
    match = _first_match(contract_date_patterns, text, re.IGNORECASE)
    if match:
        contract_info['contract_date'] = match.group(1)
        logger.info("Found contract date: %s", contract_info['contract_date'])
    else:
        contract_info['contract_date'] = 'Not found'
        logger.warning("Contract date not found")

    # 6. Extract Notice Period
    # Units are written as "months?" etc. so the plural is captured; with an
    # alternation like "month|months" the regex stops at the singular and
    # "3 months" would come back as "3 month".
    notice_patterns = [
        r'notice period.*?(\d+\s+(?:months?|years?|weeks?))',
        r'giving notices.*?(\d+\s+months?)',
        r'minimum notice period.*?(\d+\s+months?)',
    ]
    match = _first_match(notice_patterns, text_lower)
    if match:
        contract_info['notice_period'] = match.group(1)
        logger.info("Found notice period: %s", contract_info['notice_period'])
    else:
        contract_info['notice_period'] = 'Not found'
        logger.warning("Notice period not found")

    # 7. Extract Probation Period (number of months, as a string)
    probation_patterns = [
        r'first\s+(\d+)\s+months?.*?probationary',
        r'probationary period.*?(\d+)\s+months?',
        r'probation period.*?(\d+)\s+months?',
    ]
    match = _first_match(probation_patterns, text_lower)
    if match:
        contract_info['probation_period'] = match.group(1)
        logger.info("Found probation period: %s", contract_info['probation_period'])
    else:
        contract_info['probation_period'] = '3'  # Default
        logger.info("Set default probation period: 3")

    # 8. Extract Authority (the name following "Yours faithfully,")
    authority_patterns = [
        r'yours faithfully,\s*([a-z][a-z\s]+?)\s+managing director',
        r'yours faithfully,\s*([a-z][a-z\s]+?)(?:\s|$)',
        r'faithfully,\s*([a-z][a-z\s]+?)\s*director',
    ]
    match = _first_match(authority_patterns, text_lower)
    if match:
        contract_info['authority'] = _original_case(text, text_lower, match).strip()
        logger.info("Found authority: %s", contract_info['authority'])
    else:
        contract_info['authority'] = 'Not found'
        logger.warning("Authority not found")

    # 9. Extract Signature
    # Preferred form: whatever sits between "Signature:" and "Date:".
    match = re.search(r'signature:\s*(.*?)\s*date:', text, re.IGNORECASE)
    if not match:
        # Fallback: the rest of the signature line. This has to run on the raw
        # text, because the normalised text has no newlines left and the
        # pattern would capture everything up to the end of the document.
        match = re.search(r'signature:\s*([^\n]+)', raw_text, re.IGNORECASE)

    if match:
        signature = match.group(1).strip()
        # A run of dots, dashes or underscores is an empty signature line.
        if signature and not re.match(r'^[\.…\-_]+$', signature):
            contract_info['signature'] = signature
            logger.info("Found signature: %s", contract_info['signature'])
        else:
            contract_info['signature'] = 'Not Signed'
            logger.info("Signature marked as 'Not Signed'")
    else:
        contract_info['signature'] = 'Not Signed'
        logger.info("Signature not found, marked as 'Not Signed'")

    # 10. Determine Contract Type
    # 'fixed' also covers 'fixed-term'. NOTE(review): this is a plain substring
    # test, so words such as "affixed" also trigger it — confirm that is acceptable.
    if 'fixed' in text_lower:
        contract_info['contract_type'] = 'fixed-term'
    elif 'indefinite' in text_lower:
        contract_info['contract_type'] = 'indefinite'
    elif contract_info['end_date'] != 'Not found' and contract_info['start_date'] != 'Not found':
        # No explicit wording, but both a start and an end date were found.
        contract_info['contract_type'] = 'fixed-term'
    else:
        contract_info['contract_type'] = 'indefinite'

    logger.info("Contract type determined: %s", contract_info['contract_type'])

    # 11. Try to extract address (if present)
    # NOTE(review): the lazy capture ends at the first whitespace after ten
    # characters, so long addresses are cut short — confirm whether intended.
    address_patterns = [
        r'address:\s*([^\.]{10,100}?)(?:\s|\.|$)',
        r'residing at\s*([^\.]{10,100}?)(?:\s|\.|$)',
    ]
    match = _first_match(address_patterns, text_lower)
    if match:
        contract_info['address'] = _original_case(text, text_lower, match).strip()
        logger.info("Found address: %s", contract_info['address'])
    else:
        contract_info['address'] = 'Not found'
        logger.warning("Address not found")

    # Log final extracted information
    logger.info("=== FINAL EXTRACTED INFORMATION ===")
    for key, value in contract_info.items():
        logger.info("%s: %s", key, value)
    logger.info("===================================")

    return contract_info

# ---------------- Main processing function ----------------

def process_file(file_path):
    """Extract contract information from a PDF or DOCX file.

    Args:
        file_path: Path of the document as a string; the text after its last
            '.' (lowercased) selects the extractor.

    Returns:
        The dict produced by ``extract_contract_info`` for the extracted text,
        or ``{"error": ...}`` when the path is not an existing file or the
        extension is neither ``pdf`` nor ``docx``.
    """
    if not os.path.isfile(file_path):
        logger.error("File does not exist")
        return {"error": "File does not exist."}

    suffix = file_path.rsplit('.', 1)[-1].lower()
    logger.info(f"File extension: {suffix}")

    if suffix not in ('pdf', 'docx'):
        logger.error("Unsupported file format")
        return {"error": "Unsupported file format"}

    if suffix == 'docx':
        extracted = extract_text_from_docx(file_path)
    elif is_scanned_pdf(file_path):
        # No page has a text layer, so fall back to OCR.
        extracted = extract_text_from_scanned_pdf(file_path)
    else:
        extracted = extract_text_from_pdf(file_path)

    # Log a sample of the extracted text for debugging
    logger.info(f"Raw text sample (first 800 chars):\n{extracted[:800]}")

    return extract_contract_info(extracted)

# ---------------- CLI support ----------------
# Usage: pass the document path as the first argument; the result (or an error
# object when no path is given) is printed to stdout as JSON.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if cli_args:
        output = json.dumps(process_file(cli_args[0]), indent=4)
    else:
        output = json.dumps({"error": "File path not provided."})
    print(output)