#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Web Scraper untuk Artikel Liputan6.com
"""

import requests
from bs4 import BeautifulSoup
import re
import time
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class Liputan6Scraper:
    """
    Web scraper khusus untuk artikel dari Liputan6.com
    """
    
    def __init__(self):
        self.base_url = "https://www.liputan6.com"
        self.session = requests.Session()
        
        # Browser-like headers to reduce the chance of being blocked;
        # the Session applies them to every request it makes.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'id-ID,id;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        
        # CSS selectors for Liputan6 article elements, ordered from most
        # specific to most generic.
        self.selectors = {
            'title': [
                'h1.read-page--header--title',
                'h1[data-testid="article-title"]',
                'h1.article-title',
                'h1',
                '.entry-title'
            ],
            'content': [
                '.article-content-body__item-content',
                '.read-page--content-body',
                '.article-content',
                '.entry-content',
                '.content-body'
            ],
            'author': [
                '.read-page--header--author',
                '.article-author',
                '.author-name'
            ],
            'date': [
                '.read-page--header--datetime',
                '.article-date',
                '.published-date'
            ]
        }
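
        # Note: extract_element_by_selectors() tries each selector list in
        # order and keeps the first match, so the specific selectors above
        # shadow the generic fallbacks.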
    
    def validate_liputan6_url(self, url: str) -> bool:
        """
        Check whether a URL belongs to Liputan6.com.
        """
        try:
            netloc = urlparse(url).netloc.lower()
            # Accept the apex domain and its subdomains, but not look-alike
            # domains such as "notliputan6.com"
            return netloc == 'liputan6.com' or netloc.endswith('.liputan6.com')
        except ValueError:
            return False
    
    def get_page_content(self, url: str, timeout: int = 30) -> Optional[BeautifulSoup]:
        """
        Fetch a page and return its parsed HTML, or None on failure.
        """
        try:
            logger.info(f"Fetching content from: {url}")
            
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            
            # Parse with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            return soup
            
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching page: {e}")
            return None
        except Exception as e:
            logger.error(f"Error parsing page: {e}")
            return None
    
    def extract_element_by_selectors(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[str]:
        """
        Return the text of the first element matched by any of the selectors.
        """
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    return element.get_text(strip=True)
            except Exception:
                # Invalid selector or parsing hiccup; try the next one
                continue
        return None
    
    def extract_article_content(self, soup: BeautifulSoup) -> List[str]:
        """
        Extract the article body as a list of cleaned paragraphs.
        """
        content_paragraphs = []
        
        # Try each content selector until one yields paragraphs
        for selector in self.selectors['content']:
            try:
                content_container = soup.select_one(selector)
                if content_container:
                    # Collect <p> tags only; including <div> as well would
                    # duplicate text, because a wrapper <div> repeats the
                    # text of every paragraph nested inside it
                    paragraphs = content_container.find_all('p', recursive=True)
                    # Some layouts use bare <div>s instead of <p> tags; fall
                    # back to direct child <div>s when no <p> was found
                    if not paragraphs:
                        paragraphs = content_container.find_all('div', recursive=False)
                    
                    for p in paragraphs:
                        text = p.get_text(strip=True)
                        if text and len(text) > 20:  # Skip very short fragments
                            # Strip noise from the text
                            text = self.clean_paragraph_text(text)
                            if text:
                                content_paragraphs.append(text)
                    
                    if content_paragraphs:
                        break
                        
            except Exception as e:
                logger.warning(f"Error with selector {selector}: {e}")
                continue
        
        # If nothing matched, fall back to every <p> on the page
        if not content_paragraphs:
            logger.warning("No content found with the specific selectors, trying a generic fallback...")
            paragraphs = soup.find_all('p')
            
            for p in paragraphs:
                text = p.get_text(strip=True)
                if text and len(text) > 20:
                    text = self.clean_paragraph_text(text)
                    if text:
                        content_paragraphs.append(text)
        
        return content_paragraphs
    
    def clean_paragraph_text(self, text: str) -> str:
        """
        Strip common noise (editorial notes, ads, captions) from a paragraph.
        """
        if not text:
            return ""
        
        # Patterns matching boilerplate; everything from the match to the
        # end of the line is removed
        noise_patterns = [
            r'Baca [Jj]uga\s*:.*',    # "Read also" link blocks
            r'BACA JUGA\s*:.*',
            r'Lihat [Ff]oto\s*:.*',   # "See photo" captions
            r'LIHAT FOTO\s*:.*',
            r'Sumber\s*:.*',          # "Source" credits
            r'Editor\s*:.*',
            r'Reporter\s*:.*',
            r'Foto\s*:.*',            # "Photo" credits
            r'Video\s*:.*',
            r'ADVERTISEMENT.*',
            r'Iklan.*',               # Indonesian for "advertisement"
            r'^\([^)]*\)$',           # Text that is only a parenthesized phrase
            r'^\d+\.\s*$',            # A bare list number
        ]
        
        for pattern in noise_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)
        
        # Collapse excess whitespace
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        
        # Drop text that is too short or contains no letters
        if len(text) < 10 or not re.search(r'[a-zA-Z]', text):
            return ""
        
        return text
    
    def scrape_article(self, url: str) -> Dict[str, Any]:
        """
        Scrape a complete article from a Liputan6 URL.
        """
        result = {
            'url': url,
            'success': False,
            'title': '',
            'content': '',
            'paragraphs': [],
            'author': '',
            'date': '',
            'error': None
        }
        
        try:
            # Validate the URL
            if not self.validate_liputan6_url(url):
                result['error'] = "URL is not from Liputan6.com"
                return result
            
            # Fetch the page
            soup = self.get_page_content(url)
            if not soup:
                result['error'] = "Failed to fetch the page"
                return result
            
            # Extract the title
            title = self.extract_element_by_selectors(soup, self.selectors['title'])
            if title:
                result['title'] = title
            else:
                logger.warning("Title not found")
            
            # Extract the content
            paragraphs = self.extract_article_content(soup)
            if paragraphs:
                result['paragraphs'] = paragraphs
                result['content'] = '\n\n'.join(paragraphs)
                result['success'] = True
            else:
                result['error'] = "Article content not found"
                return result
            
            # Extract metadata (optional)
            author = self.extract_element_by_selectors(soup, self.selectors['author'])
            if author:
                result['author'] = author
            
            date = self.extract_element_by_selectors(soup, self.selectors['date'])
            if date:
                result['date'] = date
            
            # Log result['title'] rather than title: title may be None,
            # and slicing None would raise a TypeError
            logger.info(f"Successfully scraped article: {result['title'][:50]}...")
            
        except Exception as e:
            logger.error(f"Error scraping article: {e}")
            result['error'] = str(e)
        
        return result
    
    def scrape_multiple_articles(self, urls: List[str], delay: float = 1.0) -> List[Dict[str, Any]]:
        """
        Scrape several articles, pausing between requests.
        """
        results = []
        
        for i, url in enumerate(urls):
            logger.info(f"Scraping article {i+1}/{len(urls)}")
            
            result = self.scrape_article(url)
            results.append(result)
            
            # Pause between requests to avoid rate limiting
            if i < len(urls) - 1:
                time.sleep(delay)
        
        return results

def main():
    """
    Smoke test for the web scraper.
    """
    scraper = Liputan6Scraper()
    
    # Test URLs (replace with valid Liputan6 article URLs)
    test_urls = [
        "https://www.liputan6.com/news/read/5678901/contoh-artikel-1",
        "https://www.liputan6.com/bisnis/read/5678902/contoh-artikel-2"
    ]
    
    print("=== Testing Liputan6 Scraper ===")
    
    # Test single article
    print("\n--- Test Single Article ---")
    
    # For the demo, use a dummy URL
    dummy_url = "https://www.liputan6.com/news/read/1/batas-penyerahan-aset-dua-pekan-lagi"
    
    print("Testing URL validation:")
    print(f"  Valid Liputan6 URL: {scraper.validate_liputan6_url(dummy_url)}")
    print(f"  Invalid URL: {scraper.validate_liputan6_url('https://example.com/news')}")
    
    # Note: for real testing, use article URLs that actually exist
    print("\nFor a real scraping test, use a valid Liputan6 article URL.")
    print("Example usage:")
    print("  result = scraper.scrape_article('https://www.liputan6.com/news/read/...')")
    print("  if result['success']:")
    print("      print('Title:', result['title'])")
    print("      print('Content:', result['content'][:200] + '...')")

if __name__ == "__main__":
    main()

