#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Modul Preprocessing untuk Artikel Baru
"""

import re
import string
from typing import Any, Dict, List, Tuple

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Ensure the required NLTK data is present before this module is used.
def ensure_nltk_data():
    """Download missing NLTK resources (best effort).

    Checks for the punkt / punkt_tab tokenizer models and the stopwords
    corpus; anything missing is downloaded quietly. Download failures are
    swallowed so a missing network never blocks import, except that a
    failing 'punkt_tab' (only available in newer NLTK releases) falls
    back to downloading the legacy 'punkt' models.
    """
    required_data = [
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords')
    ]

    for path, name in required_data:
        try:
            nltk.data.find(path)
            continue  # resource already installed
        except LookupError:
            pass

        try:
            print(f"Downloading NLTK {name}...")
            nltk.download(name, quiet=True)
        # `except Exception` (not bare `except:`) so Ctrl-C still works.
        except Exception:
            # Fallback: 'punkt_tab' does not exist in older NLTK; the
            # legacy 'punkt' models cover the same tokenizers.
            if name == 'punkt_tab':
                try:
                    nltk.download('punkt', quiet=True)
                except Exception:
                    pass

# Run at import time so tokenizers are ready before any preprocessing.
ensure_nltk_data()

class ArticlePreprocessor:
    """
    Preprocessing pipeline for news-article text.

    Cleans raw text (URLs, e-mail addresses, Indonesian phone numbers,
    common HTML entities), segments it into sentences, and tokenizes
    those sentences for a downstream summarization model.
    """

    # Literal HTML entities decoded during cleaning. Class-level so the
    # mapping is built once, not on every clean_text() call.
    _HTML_ENTITIES = {
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#39;': "'",
        '&nbsp;': ' '
    }

    def __init__(self, language: str = 'indonesian'):
        """
        Args:
            language: Stopword language passed to NLTK. Defaults to
                'indonesian' (identical to the previous hard-coded value).
        """
        self.language = language
        try:
            # Honor the `language` parameter (was hard-coded 'indonesian').
            self.stop_words = set(stopwords.words(self.language))
        except Exception:
            # Corpus missing/unavailable: degrade to an empty stopword
            # set instead of crashing (was a bare `except:`).
            self.stop_words = set()

        # Pre-compiled noise patterns (compiled once per instance).
        self.url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
        self.phone_pattern = re.compile(r'(\+62|62|0)[\s-]?[0-9]{2,4}[\s-]?[0-9]{3,4}[\s-]?[0-9]{3,4}')

    def clean_text(self, text: str) -> str:
        """Strip URLs, e-mails, phone numbers and HTML entities from
        *text* and collapse whitespace.

        Returns:
            The cleaned, stripped text ("" for falsy input).
        """
        if not text:
            return ""

        # Remove URLs, e-mail addresses and phone numbers.
        text = self.url_pattern.sub('', text)
        text = self.email_pattern.sub('', text)
        text = self.phone_pattern.sub('', text)

        # Decode the handful of HTML entities we care about.
        for entity, replacement in self._HTML_ENTITIES.items():
            text = text.replace(entity, replacement)

        # Collapse any whitespace run into a single space, then trim.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def segment_sentences(self, text: str) -> List[str]:
        """Clean *text* and split it into sentences.

        Sentences with 10 characters or fewer, or 3 words or fewer, are
        discarded as likely noise (captions, fragments).
        """
        text = self.clean_text(text)
        if not text:
            return []

        filtered_sentences = []
        for sentence in sent_tokenize(text):
            sentence = sentence.strip()
            # Keep only substantive sentences: > 10 chars and > 3 words.
            if len(sentence) > 10 and len(sentence.split()) > 3:
                filtered_sentences.append(sentence)
        return filtered_sentences

    def tokenize_sentence(self, sentence: str) -> List[str]:
        """Lower-case and tokenize *sentence*.

        Returns:
            Purely alphabetic tokens longer than one character. (The old
            `token not in string.punctuation` check was redundant: an
            `isalpha()` token of length > 1 can never be punctuation.)
        """
        sentence = self.clean_text(sentence)
        tokens = word_tokenize(sentence.lower())
        return [token for token in tokens
                if len(token) > 1 and token.isalpha()]

    def preprocess_article(self, raw_text: str) -> Dict[str, Any]:
        """Preprocess a complete article.

        Args:
            raw_text: Raw article text.

        Returns:
            Dict with:
            - sentences: cleaned sentences
            - tokenized_sentences: token lists, one per sentence
            - word_count: total number of kept tokens
            - sentence_count: number of sentences
            - raw_text: the original input, untouched
        """
        sentences = self.segment_sentences(raw_text)
        tokenized_sentences = [self.tokenize_sentence(s) for s in sentences]

        return {
            'sentences': sentences,
            'tokenized_sentences': tokenized_sentences,
            'word_count': sum(len(tokens) for tokens in tokenized_sentences),
            'sentence_count': len(sentences),
            'raw_text': raw_text
        }

    def extract_article_features(self, sentences: List[str]) -> Dict[str, Any]:
        """Compute simple statistics over *sentences* for analysis.

        Returns average/max/min sentence length (in words), the total
        character count of the joined text, digit/quote presence flags,
        and the raw per-sentence word counts.
        """
        if not sentences:
            # Mirror the populated branch's schema: 'sentence_lengths'
            # was previously missing from this branch (inconsistency).
            return {
                'avg_sentence_length': 0,
                'max_sentence_length': 0,
                'min_sentence_length': 0,
                'total_characters': 0,
                'has_numbers': False,
                'has_quotes': False,
                'sentence_lengths': []
            }

        sentence_lengths = [len(sentence.split()) for sentence in sentences]
        full_text = ' '.join(sentences)

        return {
            'avg_sentence_length': sum(sentence_lengths) / len(sentence_lengths),
            'max_sentence_length': max(sentence_lengths),
            'min_sentence_length': min(sentence_lengths),
            'total_characters': len(full_text),
            'has_numbers': bool(re.search(r'\d', full_text)),
            'has_quotes': '"' in full_text or "'" in full_text,
            'sentence_lengths': sentence_lengths
        }

    def format_for_model(self, processed_article: Dict[str, Any]) -> List[str]:
        """Return the sentence list from preprocess_article() output in
        the form the summarization model consumes.

        Args:
            processed_article: Output of preprocess_article().

        Returns:
            List of cleaned sentences ready for the model.
        """
        return processed_article['sentences']

class NewsArticleExtractor:
    """
    Extracts title/content from raw news HTML and runs the content
    through ArticlePreprocessor.
    """

    def __init__(self):
        self.preprocessor = ArticlePreprocessor()

        # CSS class names commonly used for the main article container.
        # NOTE(review): currently informational only — nothing in this
        # class reads it yet.
        self.content_indicators = [
            'article', 'content', 'story', 'news-content',
            'post-content', 'entry-content', 'main-content'
        ]

        # Boilerplate/noise fragments stripped from Indonesian news text.
        self.noise_patterns = [
            r'Baca juga:.*',
            r'ADVERTISEMENT.*',
            r'Iklan.*',
            r'Sumber:.*',
            r'Editor:.*',
            r'Reporter:.*',
            r'Foto:.*',
            r'Video:.*',
            r'Lihat Foto.*',
            r'KOMPAS\.com.*',
            r'Liputan6\.com.*',
            r'\(.*\)$',  # parenthesized trailer; without MULTILINE the $ only anchors at end of the whole text
        ]

    def clean_article_content(self, text: str) -> str:
        """Strip boilerplate patterns and normalize whitespace.

        Args:
            text: Raw (already tag-free) article text.

        Returns:
            Cleaned text, "" for falsy input.
        """
        if not text:
            return ""

        # Remove noise patterns, case-insensitively.
        for pattern in self.noise_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)

        # Squash blank-line runs, then collapse all whitespace.
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def extract_title_and_content(self, html_content: str) -> Dict[str, str]:
        """Extract title and plain-text content from raw HTML.

        Naive regex-based implementation for demo purposes — a real
        deployment should use an HTML parser such as BeautifulSoup.
        """
        # Title: first <title> element, with any nested tags removed.
        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
        title = title_match.group(1) if title_match else ""
        title = re.sub(r'<[^>]+>', '', title)
        title = title.strip()

        # Content: drop every HTML tag, then clean the remaining text.
        content = re.sub(r'<[^>]+>', ' ', html_content)
        content = self.clean_article_content(content)

        return {
            'title': title,
            'content': content
        }

    def process_news_article(self, html_content: str) -> Dict[str, Any]:
        """Process a full news article from HTML.

        Returns:
            Dict with title, cleaned content, the preprocessed article
            dict, extracted features, and the model-ready sentence list.
        """
        extracted = self.extract_title_and_content(html_content)
        processed_content = self.preprocessor.preprocess_article(extracted['content'])
        features = self.preprocessor.extract_article_features(processed_content['sentences'])

        return {
            'title': extracted['title'],
            'content': extracted['content'],
            'processed': processed_content,
            'features': features,
            'sentences_for_model': self.preprocessor.format_for_model(processed_content)
        }

def main():
    """Run a quick end-to-end smoke test of the preprocessing pipeline."""
    sample_text = """
    Jakarta - Presiden Joko Widodo mengumumkan kebijakan ekonomi digital baru hari ini. 
    Kebijakan ini bertujuan untuk meningkatkan daya saing Indonesia di era digital. 
    
    Menurut Presiden, transformasi digital adalah kunci untuk mencapai Indonesia Emas 2045. 
    "Kita harus bergerak cepat dalam mengadopsi teknologi digital," ujar Jokowi dalam konferensi pers.
    
    Para ahli ekonomi memberikan respons positif terhadap kebijakan ini. 
    Dr. Budi Santoso dari Universitas Indonesia mengatakan bahwa langkah ini sangat tepat waktu.
    
    Implementasi kebijakan akan dimulai pada kuartal pertama tahun depan.
    Baca juga: Artikel terkait lainnya di Liputan6.com
    """

    # --- ArticlePreprocessor smoke test ---
    print("=== Testing Article Preprocessor ===")
    article_pre = ArticlePreprocessor()
    result = article_pre.preprocess_article(sample_text)

    print(f"Jumlah kalimat: {result['sentence_count']}")
    print(f"Jumlah kata: {result['word_count']}")
    print("\nKalimat-kalimat:")
    for idx, sent in enumerate(result['sentences'], start=1):
        print(f"{idx}. {sent}")

    # --- Feature extraction (skip the verbose per-sentence lengths) ---
    feats = article_pre.extract_article_features(result['sentences'])
    print("\nFitur artikel:")
    for name, val in feats.items():
        if name == 'sentence_lengths':
            continue
        print(f"  {name}: {val}")

    # --- NewsArticleExtractor smoke test on a minimal HTML page ---
    print("\n=== Testing News Article Extractor ===")
    extractor = NewsArticleExtractor()

    sample_html = f"""
    <html>
    <head><title>Jokowi Umumkan Kebijakan Digital - Liputan6.com</title></head>
    <body>
    <div class="content">
    {sample_text}
    </div>
    </body>
    </html>
    """

    summary_ready = extractor.process_news_article(sample_html)
    print(f"Title: {summary_ready['title']}")
    print(f"Sentences for model: {len(summary_ready['sentences_for_model'])}")
    print("Ready for summarization!")

if __name__ == "__main__":
    main()
