#!/usr/bin/env python3
"""
News Web Scraper
Extracts headlines, story text, and images from news sites and generates a standalone HTML page.
"""

import base64
import html
import os
import sys
from datetime import datetime
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class NewsScraper:
    """Scrape headlines, story text, and images from a news site and render
    them into a single self-contained HTML report (images embedded as
    base64 data URIs, so the file needs no external resources).
    """

    def __init__(self, output_dir="output"):
        """
        Args:
            output_dir: Directory where generated HTML reports are written.
                        Created on demand if it does not exist.
        """
        self.output_dir = output_dir
        self.session = requests.Session()
        # A browser-like User-Agent avoids trivial bot blocking on many sites.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # exist_ok=True avoids the check-then-create race of
        # `if not os.path.exists(...): os.makedirs(...)`.
        os.makedirs(self.output_dir, exist_ok=True)

    def fetch_page(self, url):
        """Fetch a page and return its text, or None on any request failure.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The decoded response body, or None if the request failed
            (network error, timeout, or non-2xx status).
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def download_image(self, img_url, base_url):
        """Download an image and return it as a base64 data URI for embedding.

        Args:
            img_url: Image URL, possibly relative to ``base_url``.
            base_url: Page URL used to resolve relative image URLs.

        Returns:
            A ``data:<type>;base64,...`` string, or None if the download
            failed (best-effort: a missing image should not abort scraping).
        """
        try:
            # Convert relative URLs to absolute.
            absolute_url = urljoin(base_url, img_url)

            response = self.session.get(absolute_url, timeout=10)
            response.raise_for_status()

            # Encode the raw bytes for inline embedding.
            img_data = base64.b64encode(response.content).decode('utf-8')

            # Detect content type; strip any parameters such as
            # "; charset=..." which are not valid inside a data URI.
            content_type = response.headers.get('content-type', 'image/jpeg')
            content_type = content_type.split(';')[0].strip() or 'image/jpeg'

            return f"data:{content_type};base64,{img_data}"
        except Exception as e:
            # Deliberately broad: any single bad image is skipped, not fatal.
            print(f"Warning: Could not download image {img_url}: {e}")
            return None

    def extract_articles(self, html, base_url, selectors, max_articles=10):
        """Extract articles from HTML using CSS selectors.

        Args:
            html: HTML content to parse.
            base_url: Base URL for resolving relative image links.
            selectors: Dictionary with keys 'article', 'headline', 'text',
                       'image'; each value is a CSS selector. Missing keys
                       fall back to sensible generic selectors.
            max_articles: Maximum number of articles to extract (default 10,
                          matching the previous hard-coded limit).

        Returns:
            A list of dicts with keys 'headline', 'text', and 'image'
            (the image value is a data URI string or None).
        """
        soup = BeautifulSoup(html, 'html.parser')
        articles = []

        article_elements = soup.select(selectors.get('article', 'article'))

        for article_elem in article_elements[:max_articles]:
            article_data = {}

            # Extract headline (first match wins).
            headline_elem = article_elem.select_one(selectors.get('headline', 'h1, h2, h3'))
            article_data['headline'] = headline_elem.get_text(strip=True) if headline_elem else "No headline"

            # Extract text (first paragraph-like match).
            text_elem = article_elem.select_one(selectors.get('text', 'p'))
            article_data['text'] = text_elem.get_text(strip=True) if text_elem else "No text"

            # Extract image; lazy-loaded images often put the URL in data-src.
            article_data['image'] = None
            img_elem = article_elem.select_one(selectors.get('image', 'img'))
            if img_elem:
                img_src = img_elem.get('src') or img_elem.get('data-src')
                if img_src:
                    article_data['image'] = self.download_image(img_src, base_url)

            articles.append(article_data)

        return articles

    def generate_html(self, articles, title, output_file="news.html"):
        """Generate a standalone HTML page with all articles.

        Scraped text is HTML-escaped before interpolation: headlines and
        story text come from an untrusted remote page, and embedding them
        raw would break the markup or allow script injection into the
        generated report.

        Args:
            articles: List of article dicts from :meth:`extract_articles`.
            title: Title for the report (escaped before embedding).
            output_file: File name for the report inside ``output_dir``.

        Returns:
            The path of the written HTML file.
        """
        safe_title = html.escape(title)
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <style>
        * {{
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }}
        
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            padding: 20px;
            min-height: 100vh;
        }}
        
        .container {{
            max-width: 900px;
            margin: 0 auto;
        }}
        
        header {{
            text-align: center;
            color: white;
            margin-bottom: 40px;
            padding: 20px 0;
        }}
        
        header h1 {{
            font-size: 2.5em;
            margin-bottom: 10px;
            text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3);
        }}
        
        header p {{
            font-size: 1em;
            opacity: 0.9;
        }}
        
        .articles {{
            display: grid;
            gap: 20px;
        }}
        
        article {{
            background: white;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
            transition: transform 0.3s ease, box-shadow 0.3s ease;
        }}
        
        article:hover {{
            transform: translateY(-5px);
            box-shadow: 0 15px 40px rgba(0, 0, 0, 0.3);
        }}
        
        .article-image {{
            width: 100%;
            height: 300px;
            object-fit: cover;
            background: #f0f0f0;
        }}
        
        .article-content {{
            padding: 20px;
        }}
        
        .article-headline {{
            font-size: 1.5em;
            font-weight: 600;
            color: #333;
            margin-bottom: 10px;
            line-height: 1.3;
        }}
        
        .article-text {{
            font-size: 1em;
            color: #666;
            line-height: 1.6;
            margin-bottom: 10px;
        }}
        
        .article-meta {{
            font-size: 0.85em;
            color: #999;
            border-top: 1px solid #eee;
            padding-top: 10px;
        }}
        
        footer {{
            text-align: center;
            color: white;
            margin-top: 40px;
            padding: 20px 0;
            opacity: 0.8;
        }}
    </style>
</head>
<body>
    <div class="container">
        <header>
            <h1>{safe_title}</h1>
            <p>Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        </header>
        
        <div class="articles">
"""

        for i, article in enumerate(articles, 1):
            img_html = ""
            if article.get('image'):
                # html.escape quotes the attribute value (the content-type
                # portion of the data URI comes from an untrusted header).
                img_html = f'<img src="{html.escape(article["image"])}" alt="Article image" class="article-image">'

            # Escape scraped text so remote content cannot inject markup.
            safe_headline = html.escape(article.get('headline', 'No headline'))
            safe_text = html.escape(article.get('text', 'No text'))

            html_content += f"""            <article>
                {img_html}
                <div class="article-content">
                    <h2 class="article-headline">{safe_headline}</h2>
                    <p class="article-text">{safe_text}</p>
                    <div class="article-meta">Article #{i}</div>
                </div>
            </article>
"""

        html_content += """        </div>
        
        <footer>
            <p>News Scraper - Standalone HTML Report</p>
        </footer>
    </div>
</body>
</html>
"""

        # Save to file.
        output_path = os.path.join(self.output_dir, output_file)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        print(f"✓ HTML report saved to: {output_path}")
        return output_path

    def scrape(self, url, selectors, title="News Articles", output_file="news.html"):
        """Fetch a page, extract articles, and generate the HTML report.

        Args:
            url: URL of the news site to scrape.
            selectors: CSS selectors for extracting content
                       (see :meth:`extract_articles`).
            title: Title for the HTML report.
            output_file: Name of the output HTML file.

        Returns:
            Path of the generated report, or None if the fetch failed or
            no articles were found.
        """
        print(f"Fetching {url}...")
        # Named page_html (not `html`) so the stdlib html module used for
        # escaping in generate_html is never shadowed here.
        page_html = self.fetch_page(url)

        if not page_html:
            print("Failed to fetch the page.")
            return None

        print("Extracting articles...")
        articles = self.extract_articles(page_html, url, selectors)

        print(f"Found {len(articles)} articles.")

        if articles:
            print("Generating HTML report...")
            output_path = self.generate_html(articles, title, output_file)
            return output_path
        else:
            print("No articles found. Check your CSS selectors.")
            return None

if __name__ == "__main__":
    # Example usage
    print("News Web Scraper")
    print("=" * 50)
    
    # Example: BBC News (you may need to adjust selectors based on current site structure)
    scraper = NewsScraper(output_dir="output")
    
    # You can customize these selectors for different news sites
    selectors = {
        'article': 'article',          # Container for each article
        'headline': 'h2, h3',          # Headline element
        'text': 'p',                   # Story text
        'image': 'img'                 # Image element
    }
    
    # Example: Scrape BBC News
    url = "https://www.bbc.com/news"
    
    # Uncomment to run:
    # scraper.scrape(url, selectors, title="BBC News", output_file="bbc_news.html")
    
    print("\nTo use this scraper:")
    print("1. Modify the 'url' and 'selectors' variables with your target site")
    print("2. Run: python scraper.py")
    print("\nExample with custom selectors:")
    print("""
    scraper = NewsScraper()
    selectors = {
        'article': '.news-item',
        'headline': '.headline',
        'text': '.story-text',
        'image': '.story-image img'
    }
    scraper.scrape('https://example-news.com', selectors, title="Example News")
    """)
