import json
import re
from typing import Dict, List, Any


class VATIKADataProcessor:
    """Load a domain/context/QA JSON dataset and flatten it into training examples.

    The methods expect the following layout (these are the keys they index)::

        {"domains": [
            {"domain": ...,
             "contexts": [
                 {"context": ...,
                  "qas": [{"id": ..., "question": ..., "answer": ...}, ...]},
                 ...]},
            ...]}
    """

    # Compiled once at class-definition time; preprocess_text runs for every
    # context, question and answer.
    # Anything that is not a word character, whitespace, or in the Devanagari
    # block U+0900-U+097F (which already contains the danda) is "special".
    _SPECIAL_CHARS_RE = re.compile(r'[^\w\s\u0900-\u097F।]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        # Known domain names. None of the methods in this class read this
        # list; it is kept as a public attribute for callers.
        self.domains = [
            'ganga_aarti', 'cruise', 'food_court', 'public_toilet',
            'kund', 'museum', 'general', 'ashram', 'temple', 'travel'
        ]

    def load_json_data(self, file_path: str) -> Dict[str, Any]:
        """Load JSON data from ``file_path`` (read as UTF-8).

        Best-effort: on any failure (missing file, invalid JSON, decoding
        error, ...) the error is printed and an empty dict is returned
        instead of raising.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            # Deliberately broad: callers rely on getting {} rather than an
            # exception when the file cannot be loaded.
            print(f"Error loading {file_path}: {e}")
            return {}

    def extract_contexts_and_qas(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Flatten ``data['domains']`` into one record per context.

        Returns a list of ``{'domain', 'context', 'qas'}`` dicts in dataset
        order, or an empty list when ``data`` has no ``'domains'`` key.

        Raises:
            KeyError: if a domain entry lacks ``'domain'``/``'contexts'`` or a
                context entry lacks ``'context'``/``'qas'``.
        """
        if 'domains' not in data:
            return []

        return [
            {
                'domain': domain_data['domain'],
                'context': context_data['context'],
                'qas': context_data['qas'],
            }
            for domain_data in data['domains']
            for context_data in domain_data['contexts']
        ]

    def preprocess_text(self, text: str) -> str:
        """Normalize Hindi text for training.

        Every character that is not a word character, whitespace, or in the
        Devanagari block (U+0900-U+097F, danda included) is replaced by a
        space; whitespace runs are then collapsed to a single space and the
        result is stripped.

        NOTE(review): zero-width joiner/non-joiner (U+200D/U+200C) are outside
        the allowed class, so they are replaced by a space too - confirm that
        this is intended for the corpus.
        """
        # Replace special characters FIRST: each replacement inserts a space,
        # so collapsing whitespace beforehand would leave doubled spaces
        # behind (e.g. "a, b" -> "a  b").
        text = self._SPECIAL_CHARS_RE.sub(' ', text)
        text = self._WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def create_training_examples(self, contexts_qas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create one training example per question/answer pair.

        ``contexts_qas`` is the output of :meth:`extract_contexts_and_qas`.
        Context, question and answer are passed through
        :meth:`preprocess_text`; ``id`` and ``domain`` are copied unchanged.

        Raises:
            KeyError: if an item lacks ``'domain'``/``'context'``/``'qas'`` or
                a QA entry lacks ``'id'``/``'question'``/``'answer'``.
        """
        training_examples: List[Dict[str, Any]] = []

        for item in contexts_qas:
            domain = item['domain']
            # Preprocess the context once; it is shared by all of its QAs.
            context = self.preprocess_text(item['context'])

            training_examples.extend(
                {
                    'id': qa['id'],
                    'domain': domain,
                    'context': context,
                    'question': self.preprocess_text(qa['question']),
                    'answer': self.preprocess_text(qa['answer']),
                }
                for qa in item['qas']
            )

        return training_examples