#!/usr/bin/env python3
"""
News Web Scraper
Extracts headlines, story text, and images from news sites and generates a standalone HTML page.
"""

import base64
import html
import os
import sys
from datetime import datetime
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class NewsScraper:
    """Scrape headlines, story text, and images from a news site and render
    them into a single self-contained HTML report (images embedded as
    base64 data URIs, so the file needs no external resources).
    """

    def __init__(self, output_dir="output"):
        """
        Args:
            output_dir: Directory where generated HTML reports are written.
                        Created on demand if it does not exist.
        """
        self.output_dir = output_dir
        self.session = requests.Session()
        # A browser-like User-Agent avoids trivial bot blocking on many sites.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # exist_ok=True avoids the check-then-create race of
        # `if not os.path.exists(...): os.makedirs(...)`.
        os.makedirs(self.output_dir, exist_ok=True)

    def fetch_page(self, url):
        """Fetch a page and return its text, or None on any request failure.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The decoded response body, or None if the request failed
            (network error, timeout, or non-2xx status).
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def download_image(self, img_url, base_url):
        """Download an image and return it as a base64 data URI for embedding.

        Args:
            img_url: Image URL, possibly relative to ``base_url``.
            base_url: Page URL used to resolve relative image URLs.

        Returns:
            A ``data:<type>;base64,...`` string, or None if the download
            failed (best-effort: a missing image should not abort scraping).
        """
        try:
            # Convert relative URLs to absolute.
            absolute_url = urljoin(base_url, img_url)

            response = self.session.get(absolute_url, timeout=10)
            response.raise_for_status()

            # Encode the raw bytes for inline embedding.
            img_data = base64.b64encode(response.content).decode('utf-8')

            # Detect content type; strip any parameters such as
            # "; charset=..." which are not valid inside a data URI.
            content_type = response.headers.get('content-type', 'image/jpeg')
            content_type = content_type.split(';')[0].strip() or 'image/jpeg'

            return f"data:{content_type};base64,{img_data}"
        except Exception as e:
            # Deliberately broad: any single bad image is skipped, not fatal.
            print(f"Warning: Could not download image {img_url}: {e}")
            return None

    def extract_articles(self, html, base_url, selectors, max_articles=10):
        """Extract articles from HTML using CSS selectors.

        Args:
            html: HTML content to parse.
            base_url: Base URL for resolving relative image links.
            selectors: Dictionary with keys 'article', 'headline', 'text',
                       'image'; each value is a CSS selector. Missing keys
                       fall back to sensible generic selectors.
            max_articles: Maximum number of articles to extract (default 10,
                          matching the previous hard-coded limit).

        Returns:
            A list of dicts with keys 'headline', 'text', and 'image'
            (the image value is a data URI string or None).
        """
        soup = BeautifulSoup(html, 'html.parser')
        articles = []

        article_elements = soup.select(selectors.get('article', 'article'))

        for article_elem in article_elements[:max_articles]:
            article_data = {}

            # Extract headline (first match wins).
            headline_elem = article_elem.select_one(selectors.get('headline', 'h1, h2, h3'))
            article_data['headline'] = headline_elem.get_text(strip=True) if headline_elem else "No headline"

            # Extract text (first paragraph-like match).
            text_elem = article_elem.select_one(selectors.get('text', 'p'))
            article_data['text'] = text_elem.get_text(strip=True) if text_elem else "No text"

            # Extract image; lazy-loaded images often put the URL in data-src.
            article_data['image'] = None
            img_elem = article_elem.select_one(selectors.get('image', 'img'))
            if img_elem:
                img_src = img_elem.get('src') or img_elem.get('data-src')
                if img_src:
                    article_data['image'] = self.download_image(img_src, base_url)

            articles.append(article_data)

        return articles

    def generate_html(self, articles, title, output_file="news.html"):
        """Generate a standalone HTML page with all articles.

        Scraped text is HTML-escaped before interpolation: headlines and
        story text come from an untrusted remote page, and embedding them
        raw would break the markup or allow script injection into the
        generated report.

        Args:
            articles: List of article dicts from :meth:`extract_articles`.
            title: Title for the report (escaped before embedding).
            output_file: File name for the report inside ``output_dir``.

        Returns:
            The path of the written HTML file.
        """
        safe_title = html.escape(title)
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <style>
        * {{
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }}
        
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            padding: 20px;
            min-height: 100vh;
        }}
        
        .container {{
            max-width: 900px;
            margin: 0 auto;
        }}
        
        header {{
            text-align: center;
            color: white;
            margin-bottom: 40px;
            padding: 20px 0;
        }}
        
        header h1 {{
            font-size: 2.5em;
            margin-bottom: 10px;
            text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3);
        }}
        
        header p {{
            font-size: 1em;
            opacity: 0.9;
        }}
        
        .articles {{
            display: grid;
            gap: 20px;
        }}
        
        article {{
            background: white;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
            transition: transform 0.3s ease, box-shadow 0.3s ease;
        }}
        
        article:hover {{
            transform: translateY(-5px);
            box-shadow: 0 15px 40px rgba(0, 0, 0, 0.3);
        }}
        
        .article-image {{
            width: 100%;
            height: 300px;
            object-fit: cover;
            background: #f0f0f0;
        }}
        
        .article-content {{
            padding: 20px;
        }}
        
        .article-headline {{
            font-size: 1.5em;
            font-weight: 600;
            color: #333;
            margin-bottom: 10px;
            line-height: 1.3;
        }}
        
        .article-text {{
            font-size: 1em;
            color: #666;
            line-height: 1.6;
            margin-bottom: 10px;
        }}
        
        .article-meta {{
            font-size: 0.85em;
            color: #999;
            border-top: 1px solid #eee;
            padding-top: 10px;
        }}
        
        footer {{
            text-align: center;
            color: white;
            margin-top: 40px;
            padding: 20px 0;
            opacity: 0.8;
        }}
    </style>
</head>
<body>
    <div class="container">
        <header>
            <h1>{safe_title}</h1>
            <p>Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        </header>
        
        <div class="articles">
"""

        for i, article in enumerate(articles, 1):
            img_html = ""
            if article.get('image'):
                # html.escape quotes the attribute value (the content-type
                # portion of the data URI comes from an untrusted header).
                img_html = f'<img src="{html.escape(article["image"])}" alt="Article image" class="article-image">'

            # Escape scraped text so remote content cannot inject markup.
            safe_headline = html.escape(article.get('headline', 'No headline'))
            safe_text = html.escape(article.get('text', 'No text'))

            html_content += f"""            <article>
                {img_html}
                <div class="article-content">
                    <h2 class="article-headline">{safe_headline}</h2>
                    <p class="article-text">{safe_text}</p>
                    <div class="article-meta">Article #{i}</div>
                </div>
            </article>
"""

        html_content += """        </div>
        
        <footer>
            <p>News Scraper - Standalone HTML Report</p>
        </footer>
    </div>
</body>
</html>
"""

        # Save to file.
        output_path = os.path.join(self.output_dir, output_file)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        print(f"✓ HTML report saved to: {output_path}")
        return output_path

    def scrape(self, url, selectors, title="News Articles", output_file="news.html"):
        """Fetch a page, extract articles, and generate the HTML report.

        Args:
            url: URL of the news site to scrape.
            selectors: CSS selectors for extracting content
                       (see :meth:`extract_articles`).
            title: Title for the HTML report.
            output_file: Name of the output HTML file.

        Returns:
            Path of the generated report, or None if the fetch failed or
            no articles were found.
        """
        print(f"Fetching {url}...")
        # Named page_html (not `html`) so the stdlib html module used for
        # escaping in generate_html is never shadowed here.
        page_html = self.fetch_page(url)

        if not page_html:
            print("Failed to fetch the page.")
            return None

        print("Extracting articles...")
        articles = self.extract_articles(page_html, url, selectors)

        print(f"Found {len(articles)} articles.")

        if articles:
            print("Generating HTML report...")
            output_path = self.generate_html(articles, title, output_file)
            return output_path
        else:
            print("No articles found. Check your CSS selectors.")
            return None

if __name__ == "__main__":
    # Example usage
    print("News Web Scraper")
    print("=" * 50)
    
    # Example: BBC News (you may need to adjust selectors based on current site structure)
    scraper = NewsScraper(output_dir="output")
    
    # You can customize these selectors for different news sites
    selectors = {
        'article': 'article',          # Container for each article
        'headline': 'h2, h3',          # Headline element
        'text': 'p',                   # Story text
        'image': 'img'                 # Image element
    }
    
    # Example: Scrape BBC News
    url = "https://www.bbc.com/news"
    
    # Uncomment to run:
    # scraper.scrape(url, selectors, title="BBC News", output_file="bbc_news.html")
    
    print("\nTo use this scraper:")
    print("1. Modify the 'url' and 'selectors' variables with your target site")
    print("2. Run: python scraper.py")
    print("\nExample with custom selectors:")
    print("""
    scraper = NewsScraper()
    selectors = {
        'article': '.news-item',
        'headline': '.headline',
        'text': '.story-text',
        'image': '.story-image img'
    }
    scraper.scrape('https://example-news.com', selectors, title="Example News")
    """)
