"""
Fine-Tuning Data Export — Fogbreak AI.

Exports agent-approved AI outputs from the Fogbreak database
into training datasets for fine-tuning Mistral 7B.

Data sources:
- Listing descriptions (approved by agent)
- Email campaigns (high open/click rates)
- Social posts (high engagement)

Output: JSONL files in OpenAI chat-completion format.
"""

import json
import logging
import os
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional

logger = logging.getLogger("fogbreak.finetune.export")

# Database connection via environment
DB_CONFIG = {
    "host": os.getenv("FOGBREAK_DB_HOST", "localhost"),
    "port": int(os.getenv("FOGBREAK_DB_PORT", "5432")),
    "database": os.getenv("FOGBREAK_DB_NAME", "fogbreak"),
    "user": os.getenv("FOGBREAK_DB_USER", "fogbreak"),
    "password": os.getenv("FOGBREAK_DB_PASSWORD", ""),
}


@dataclass
class TrainingExample:
    """A single fine-tuning training example."""
    system_prompt: str
    user_input: str
    assistant_output: str
    metadata: dict


def get_db_connection():
    """Get a database connection. Supports PostgreSQL."""
    try:
        import psycopg2
        return psycopg2.connect(**DB_CONFIG)
    except ImportError:
        logger.error("psycopg2 not installed. Run: pip install psycopg2-binary")
        sys.exit(1)


def export_listing_descriptions(
    conn,
    tenant_id: Optional[int] = None,
    min_approval_score: float = 0.8,
    output_dir: str = "training_data",
) -> int:
    """
    Export agent-approved listing descriptions as training data.
    Only includes descriptions that were approved (not edited heavily).
    """
    cursor = conn.cursor()

    query = """
        SELECT
            p.address,
            p.city,
            p.state,
            p.bedrooms,
            p.bathrooms,
            p.sqft,
            p.price,
            p.property_type,
            p.features,
            ai.generated_text,
            ai.approved_text,
            ai.approval_score
        FROM ai_generated_content ai
        JOIN properties p ON p.id = ai.source_id AND p.tenant_id = ai.tenant_id
        WHERE ai.content_type = 'listing_description'
        AND ai.status = 'approved'
        AND ai.approval_score >= %s
    """
    params = [min_approval_score]

    if tenant_id is not None:
        query += " AND ai.tenant_id = %s"
        params.append(tenant_id)
    else:
        query += " AND ai.tenant_id IS NOT NULL"

    query += " ORDER BY ai.created_at DESC"

    cursor.execute(query, params)
    rows = cursor.fetchall()

    examples = []
    for row in rows:
        (address, city, state, beds, baths, sqft, price,
         prop_type, features, generated, approved, score) = row

        # Use the approved text (may be lightly edited by agent)
        final_text = approved if approved else generated

        system_prompt = (
            "You are a luxury real estate copywriter. Write compelling, "
            "accurate listing descriptions that highlight key features "
            "and appeal to qualified buyers. Follow Fair Housing guidelines."
        )

        user_input = (
            f"Write a listing description for this property:\n"
            f"Address: {address}, {city}, {state}\n"
            f"Type: {prop_type}\n"
            f"Bedrooms: {beds} | Bathrooms: {baths} | Sqft: {sqft}\n"
            f"Price: ${price:,.0f}\n"
            f"Features: {features or 'N/A'}"
        )

        examples.append(TrainingExample(
            system_prompt=system_prompt,
            user_input=user_input,
            assistant_output=final_text,
            metadata={"type": "listing", "city": city, "score": score},
        ))

    cursor.close()
    count = _write_jsonl(examples, output_dir, "listing_descriptions.jsonl")
    logger.info(f"Exported {count} listing description training examples")
    return count


def export_email_campaigns(
    conn,
    tenant_id: Optional[int] = None,
    min_open_rate: float = 0.3,
    output_dir: str = "training_data",
) -> int:
    """
    Export high-performing email campaigns as training data.
    Filters for emails with above-average open and click rates.
    """
    cursor = conn.cursor()

    query = """
        SELECT
            ds.subject,
            ds.body,
            ds.step_type,
            dc.name AS campaign_name,
            dc.campaign_type,
            em.open_rate,
            em.click_rate
        FROM drip_steps ds
        JOIN drip_campaigns dc ON dc.id = ds.campaign_id AND dc.tenant_id = ds.tenant_id
        LEFT JOIN email_metrics em ON em.drip_step_id = ds.id AND em.tenant_id = ds.tenant_id
        WHERE ds.step_type = 'email'
        AND em.open_rate >= %s
    """
    params = [min_open_rate]

    if tenant_id is not None:
        query += " AND ds.tenant_id = %s"
        params.append(tenant_id)
    else:
        query += " AND ds.tenant_id IS NOT NULL"

    query += " ORDER BY em.open_rate DESC"

    cursor.execute(query, params)
    rows = cursor.fetchall()

    examples = []
    for row in rows:
        subject, body, step_type, campaign_name, campaign_type, open_rate, click_rate = row

        system_prompt = (
            "You are a real estate email marketing expert. Write engaging "
            "nurture emails that build trust, provide value, and maintain "
            "client relationships. Follow Fair Housing guidelines. "
            "Never use discriminatory language."
        )

        user_input = (
            f"Write a {campaign_type} nurture email.\n"
            f"Campaign: {campaign_name}\n"
            f"Subject line style: {subject}\n"
            f"Goal: Build relationship and provide value"
        )

        examples.append(TrainingExample(
            system_prompt=system_prompt,
            user_input=user_input,
            assistant_output=f"Subject: {subject}\n\n{body}",
            metadata={
                "type": "email",
                "campaign": campaign_type,
                "open_rate": open_rate,
                "click_rate": click_rate,
            },
        ))

    cursor.close()
    count = _write_jsonl(examples, output_dir, "email_campaigns.jsonl")
    logger.info(f"Exported {count} email campaign training examples")
    return count


def export_social_posts(
    conn,
    tenant_id: Optional[int] = None,
    min_engagement_rate: float = 0.05,
    output_dir: str = "training_data",
) -> int:
    """
    Export high-engagement social posts as training data.
    """
    cursor = conn.cursor()

    query = """
        SELECT
            sp.platform,
            sp.content,
            sp.post_type,
            sp.hashtags,
            sp.engagement_rate,
            sp.likes,
            sp.shares,
            sp.comments,
            p.address,
            p.city,
            p.price,
            p.property_type
        FROM social_posts sp
        LEFT JOIN properties p ON p.id = sp.property_id AND p.tenant_id = sp.tenant_id
        WHERE sp.status = 'published'
        AND sp.engagement_rate >= %s
    """
    params = [min_engagement_rate]

    if tenant_id is not None:
        query += " AND sp.tenant_id = %s"
        params.append(tenant_id)
    else:
        query += " AND sp.tenant_id IS NOT NULL"

    query += " ORDER BY sp.engagement_rate DESC"

    cursor.execute(query, params)
    rows = cursor.fetchall()

    examples = []
    for row in rows:
        (platform, content, post_type, hashtags, engagement,
         likes, shares, comments, address, city, price, prop_type) = row

        system_prompt = (
            f"You are a real estate social media expert writing for {platform}. "
            f"Create engaging posts that drive interaction and showcase properties. "
            f"Follow Fair Housing guidelines."
        )

        property_context = ""
        if address:
            property_context = (
                f"\nProperty: {address}, {city}\n"
                f"Type: {prop_type} | Price: ${price:,.0f}"
            )

        user_input = (
            f"Write a {post_type} post for {platform}.{property_context}\n"
            f"Include relevant hashtags."
        )

        examples.append(TrainingExample(
            system_prompt=system_prompt,
            user_input=user_input,
            assistant_output=content,
            metadata={
                "type": "social",
                "platform": platform,
                "engagement": engagement,
            },
        ))

    cursor.close()
    count = _write_jsonl(examples, output_dir, "social_posts.jsonl")
    logger.info(f"Exported {count} social post training examples")
    return count


def _write_jsonl(
    examples: list[TrainingExample],
    output_dir: str,
    filename: str,
) -> int:
    """Write training examples to JSONL in OpenAI chat format."""
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    filepath = Path(output_dir) / filename

    count = 0
    with open(filepath, "w") as f:
        for ex in examples:
            record = {
                "messages": [
                    {"role": "system", "content": ex.system_prompt},
                    {"role": "user", "content": ex.user_input},
                    {"role": "assistant", "content": ex.assistant_output},
                ]
            }
            f.write(json.dumps(record) + "\n")
            count += 1

    logger.info(f"Wrote {count} examples to {filepath}")
    return count


def export_all(
    tenant_id: Optional[int] = None,
    output_dir: str = "training_data",
) -> dict[str, int]:
    """Export all training data sources."""
    conn = get_db_connection()

    try:
        results = {
            "listings": export_listing_descriptions(conn, tenant_id, output_dir=output_dir),
            "emails": export_email_campaigns(conn, tenant_id, output_dir=output_dir),
            "social": export_social_posts(conn, tenant_id, output_dir=output_dir),
        }
        total = sum(results.values())
        logger.info(f"Total training examples exported: {total}")
        return results
    finally:
        conn.close()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    tenant = int(sys.argv[1]) if len(sys.argv) > 1 else None
    out_dir = sys.argv[2] if len(sys.argv) > 2 else "training_data"

    results = export_all(tenant_id=tenant, output_dir=out_dir)
    print(f"\nExport complete: {results}")
