fengli, 02-28-2025 06:38 AM
Failed to crawl the full text of Google News RSS articles; please help me, thank you very much

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import time

class GoogleNewsFullTextRecipe(BasicNewsRecipe):
    title = 'Google News Full Text - Multiple Sources'
    description = 'Fetches full-text articles from multiple Google News RSS feeds'
    language = 'en'
    max_articles_per_feed = 10  # Maximum number of articles per RSS source
    oldest_article = 7  # Only fetch articles from the last 7 days
    remove_empty_feeds = True
    simultaneous_downloads = 5  # Number of concurrent download threads
    delay = 3  # Fetch interval (seconds) to avoid being blocked

    # Define multiple Google News RSS feeds
    feeds = [
        ('Top Stories', 'https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en'),
        ('Technology', 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'),
        ('Science', 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFp0Y1RjU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'),
    ]

    def parse_feeds(self):
        """Parse RSS sources and extract article information"""
        feeds = []
        for title, url in self.feeds:
            try:
                soup = self.index_to_soup(url)  # Use built-in methods to parse RSS
                items = soup.findAll('item')
                articles = []
                for item in items:
                    article = {
                        'title': item.title.text,
                        'url': item.link.text,
                        'description': item.description.text if item.description else '',
                        'date': item.pubDate.text if item.pubDate else ''
                    }
                    articles.append(article)
                if articles:
                    feeds.append((title, articles))
            except Exception as e:
                self.log(f'Error parsing feed {title}: {e}')
        return feeds

    def get_article_url(self, article):
        """Handle redirect links and get the actual article URL"""
        try:
            response = self.browser.open_novisit(article['url'])  # Use built-in browser tools
            return response.geturl()
        except Exception as e:
            self.log(f'Error resolving URL {article["url"]}: {e}')
            return article['url']

    def fetch_article(self, url):
        """Get the HTML content of the article"""
        try:
            response = self.urlopen(url)  # Use built-in urlopen method
            html = response.read().decode('utf-8')
            soup = BeautifulSoup(html, 'html.parser')
            return soup
        except Exception as e:
            self.log(f'Error fetching article {url}: {e}')
            return None

    def preprocess_html(self, soup):
        """Extract the article body"""
        content = soup.find('article') or soup.find('div', class_='article-body') or soup.body
        return content if content else soup.body
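
In case it helps with diagnosing this, below is a minimal standalone sketch, separate from the recipe, using only the standard library (urllib.request and xml.etree.ElementTree). It fetches the same Top Stories feed as in the recipe, prints the first few item titles and links, and then follows redirects on the first link to see where it ends up. The Mozilla User-Agent header is just a guess on my part, and I am only assuming that the item links pointing at news.google.com rather than at the publisher may be part of the problem.

# Standalone debugging sketch -- not part of the recipe above.
# Only the standard library; the feed URL is the same Top Stories URL as in the recipe.
import urllib.request
import xml.etree.ElementTree as ET

FEED_URL = 'https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en'

# Fetch the raw RSS XML (User-Agent header is an assumption, not required by calibre).
req = urllib.request.Request(FEED_URL, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp:
    xml_data = resp.read()

# Parse the feed and show what the <item> entries actually contain.
root = ET.fromstring(xml_data)
items = root.findall('./channel/item')
print(f'{len(items)} items in feed')
for item in items[:3]:
    print('TITLE:', item.findtext('title'))
    print('LINK: ', item.findtext('link'))

# Follow redirects on the first link to see which URL it resolves to.
if items:
    first_link = items[0].findtext('link')
    req2 = urllib.request.Request(first_link, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req2) as resp2:
        print('Resolved to:', resp2.geturl())

This should at least show whether the feed items parse and what kind of URL the link elements contain; if anyone can point out what the recipe itself is doing wrong, I would really appreciate it.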
