
    1i#&                        d Z ddlZddlmZ ddlmZmZ ddlZddlmZ ddl	Z	ddl
Z
 G d d      Zedk(  rK ed	        ed
        ed      ZdddddZdZ ed        ed        ed        ed        ed       yy)zs
News Web Scraper
Extracts headlines, story text, and images from news sites and generates a standalone HTML page.
    N)BeautifulSoup)urljoinurlparse)datetimec                   6    e Zd ZddZd Zd Zd Zd	dZd
dZy)NewsScraperc                 $   || _         t        j                         | _        | j                  j                  j                  ddi       t        j                  j                  | j                         s t        j                  | j                          y y )Nz
User-Agentz<Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36)

output_dirrequestsSessionsessionheadersupdateospathexistsmakedirs)selfr
   s     '/var/www/site01/news-scraper/scraper.py__init__zNewsScraper.__init__   sg    $'')##X%
 	
 ww~~doo.KK( /    c                     	 | j                   j                  |d      }|j                          |j                  S # t        j
                  $ r}t        d| d|        Y d}~yd}~ww xY w)zFetch the page content.
   timeoutzError fetching : N)r   getraise_for_statustextr   RequestExceptionprint)r   urlresponsees       r   
fetch_pagezNewsScraper.fetch_page   sd    	||''R'8H%%'== (( 	OC51#./	s   8; A)A$$A)c                 h   	 t        ||      }| j                  j                  |d      }|j                          t	        j
                  |j                        j                  d      }|j                  j                  dd      }d| d| S # t        $ r}t        d| d	|        Y d
}~y
d
}~ww xY w)z3Download image and convert to base64 for embedding.r   r   utf-8zcontent-typez
image/jpegzdata:z;base64,z"Warning: Could not download image r   N)r   r   r   r   base64	b64encodecontentdecoder   	Exceptionr!   )r   img_urlbase_urlabsolute_urlr#   img_datacontent_typer$   s           r   download_imagezNewsScraper.download_image&   s    	"8W5L||''b'AH%%' ''(8(89@@IH $++//ML<.
;; 	6wir!EF	s   B
B 	B1B,,B1c                 L   t        |d      }g }|j                  |j                  dd            }|dd D ]  }i }|j                  |j                  dd            }	|	r|	j	                  d      nd	|d<   |j                  |j                  d
d            }
|
r|
j	                  d      nd|d
<   d|d<   |j                  |j                  dd            }|r;|j                  d      xs |j                  d      }|r| j                  ||      |d<   |j                  |        |S )a:  
        Extract articles from HTML using CSS selectors.
        
        Args:
            html: HTML content
            base_url: Base URL for resolving relative links
            selectors: Dictionary with keys: 'article', 'headline', 'text', 'image'
                      Each value is a CSS selector
        zhtml.parserarticleNr   headlinez
h1, h2, h3T)stripNo headliner   pNo textimageimgsrczdata-src)r   selectr   
select_oneget_textr2   append)r   htmlr.   	selectorssouparticlesarticle_elementsarticle_elemarticle_dataheadline_elem	text_elemimg_elemimg_srcs                r   extract_articleszNewsScraper.extract_articles:   s1    T=1;;y}}Y	'JK,Sb1 	*LL )33IMM*l4[\MMZ}'='=D'='I`mL$ %//	fc0JKIEN9#5#5D#5#AT]L  %)L!#..y}}We/LMH",,u-Ij1I,0,?,?,RL)OOL)'	** r   c                    d| d| dt        j                         j                  d       d}t        |d      D ]S  \  }}d}|j	                  d      r	d	|d    d
}|d| d|j	                  dd       d|j	                  dd       d| d	z  }U |dz  }t
        j                  j                  | j                  |      }t        |dd      5 }	|	j                  |       ddd       t        d|        |S # 1 sw Y   xY w)z2Generate a standalone HTML page with all articles.z<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ax	  </title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }
        
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            padding: 20px;
            min-height: 100vh;
        }
        
        .container {
            max-width: 900px;
            margin: 0 auto;
        }
        
        header {
            text-align: center;
            color: white;
            margin-bottom: 40px;
            padding: 20px 0;
        }
        
        header h1 {
            font-size: 2.5em;
            margin-bottom: 10px;
            text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3);
        }
        
        header p {
            font-size: 1em;
            opacity: 0.9;
        }
        
        .articles {
            display: grid;
            gap: 20px;
        }
        
        article {
            background: white;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
            transition: transform 0.3s ease, box-shadow 0.3s ease;
        }
        
        article:hover {
            transform: translateY(-5px);
            box-shadow: 0 15px 40px rgba(0, 0, 0, 0.3);
        }
        
        .article-image {
            width: 100%;
            height: 300px;
            object-fit: cover;
            background: #f0f0f0;
        }
        
        .article-content {
            padding: 20px;
        }
        
        .article-headline {
            font-size: 1.5em;
            font-weight: 600;
            color: #333;
            margin-bottom: 10px;
            line-height: 1.3;
        }
        
        .article-text {
            font-size: 1em;
            color: #666;
            line-height: 1.6;
            margin-bottom: 10px;
        }
        
        .article-meta {
            font-size: 0.85em;
            color: #999;
            border-top: 1px solid #eee;
            padding-top: 10px;
        }
        
        footer {
            text-align: center;
            color: white;
            margin-top: 40px;
            padding: 20px 0;
            opacity: 0.8;
        }
    </style>
</head>
<body>
    <div class="container">
        <header>
            <h1>z"</h1>
            <p>Generated on z%Y-%m-%d %H:%M:%Sz?</p>
        </header>
        
        <div class="articles">
    r:   z
<img src="z," alt="Article image" class="article-image">z&            <article>
                z`
                <div class="article-content">
                    <h2 class="article-headline">r5   r7   z2</h2>
                    <p class="article-text">r   r9   z<</p>
                    <div class="article-meta">Article #z5</div>
                </div>
            </article>
z        </div>
        
        <footer>
            <p>News Scraper - Standalone HTML Report</p>
        </footer>
    </div>
</body>
</html>
wr'   )encodingNu   ✓ HTML report saved to: )r   nowstrftime	enumerater   r   r   joinr
   openwriter!   )
r   rD   titleoutput_filehtml_contentir4   img_htmloutput_pathfs
             r   generate_htmlzNewsScraper.generate_html`   sS   
 7 eJ  %\\^445HIJ KWob $Ha0 	JAwH{{7#'(8'99ef !
 229++j-2X1Y Z--4[[-K,L M889s ; L	 	  	 ggll4??K@+sW5 	"GGL!	" 	*;-89		" 	"s   C77D c                 &   t        d| d       | j                  |      }|st        d       yt        d       | j                  |||      }t        dt        |       d       |r t        d       | j	                  |||      }|S t        d	       y)
a  
        Main scraping method.
        
        Args:
            url: URL of the news site to scrape
            selectors: CSS selectors for extracting content
            title: Title for the HTML report
            output_file: Name of the output HTML file
        z	Fetching z...zFailed to fetch the page.NzExtracting articles...zFound z
 articles.zGenerating HTML report...z,No articles found. Check your CSS selectors.)r!   r%   rL   lenr_   )r   r"   rB   rX   rY   rA   rD   r]   s           r   scrapezNewsScraper.scrape   s     		#c"#s#-.&'((sI>s8}oZ01-.,,XukJK@Ar   N)output)	news.html)zNews Articlesrd   )	__name__
__module____qualname__r   r%   r2   rL   r_   rb    r   r   r   r      s#    	)($LRhr   r   __main__zNews Web Scraperz2==================================================rc   )r
   r4   zh2, h3r8   r;   )r4   r5   r   r:   zhttps://www.bbc.com/newsz
To use this scraper:zC1. Modify the 'url' and 'selectors' variables with your target sitez2. Run: python scraper.pyz
Example with custom selectors:a  
    scraper = NewsScraper()
    selectors = {
        'article': '.news-item',
        'headline': '.headline',
        'text': '.story-text',
        'image': '.story-image img'
    }
    scraper.scrape('https://example-news.com', selectors, title="Example News")
    )__doc__r   bs4r   urllib.parser   r   r   r   r(   sysr   re   r!   scraperrB   r"   rh   r   r   <module>ro      s   
   * 	   
@ @F z	
	(O X.G 	I %C
 

"#	
OP	
%&	
,-	 	 		5 r   