Failed to crawl the full text of Google News RSS, please help me, thank you very much - MobileRead Forums

		MobileRead Forums > E-Book Software > Calibre > Recipes
Failed to crawl the full text of Google News RSS, please help me, thank you very much

Reply

Thread Tools

Search this Thread

02-28-2025, 06:38 AM	#1
# fengli Zealot Posts: 111 Karma: 39846 Join Date: Aug 2022 Device: PC
# Failed to crawl the full text of Google News RSS, please help me, thank you very much
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import time


# Spoiler:
class GoogleNewsFullTextRecipe(BasicNewsRecipe):
    """Calibre recipe that builds a periodical from several Google News RSS feeds.

    ``parse_feeds`` reads each RSS document listed in ``feeds`` and hands the
    collected articles to calibre as feed objects.  The remaining methods are
    helpers for resolving article links and trimming downloaded pages down to
    their main content.
    """

    title = 'Google News Full Text - Multiple Sources'
    description = 'Fetches full-text articles from multiple Google News RSS feeds'
    language = 'en'
    max_articles_per_feed = 10  # Maximum number of articles per RSS source
    oldest_article = 7  # Only fetch articles from the last 7 days
    remove_empty_feeds = True
    simultaneous_downloads = 5  # Number of concurrent download threads
    delay = 3  # Fetch interval (seconds) to avoid being blocked

    # Define multiple Google News RSS feeds.
    # The topic IDs must not contain whitespace; the forum paste had split them.
    feeds = [
        ('Top Stories', 'https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en'),
        ('Technology', 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'),
        ('Science', 'https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFp0Y1RjU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen'),
    ]

    def parse_feeds(self):
        """Parse every RSS source in ``feeds`` and return calibre feed objects.

        A feed that cannot be downloaded or parsed is logged and skipped so
        that one bad source does not abort the whole download.

        Returns:
            The list produced by ``feeds_from_index`` for the collected
            ``(feed title, [article dict, ...])`` pairs.
        """
        from xml.etree import ElementTree

        from calibre.web.feeds import feeds_from_index

        index = []
        for title, url in self.feeds:
            try:
                # Parse the feed as XML: an HTML parser treats <link> as an
                # empty element and lower-cases <pubDate>, which loses data.
                raw = self.index_to_soup(url, raw=True)
                root = ElementTree.fromstring(raw)
            except Exception as e:
                self.log(f'Error parsing feed {title}: {e}')
                continue

            articles = []
            for item in root.iter('item'):
                link = (item.findtext('link') or '').strip()
                if not link:
                    # Nothing can be downloaded for an item without a URL.
                    continue
                articles.append({
                    'title': (item.findtext('title') or '').strip(),
                    'url': link,
                    'description': item.findtext('description') or '',
                    'date': item.findtext('pubDate') or '',
                })
            if articles:
                index.append((title, articles))

        # Returning the bare (title, articles) tuples is what caused
        # "AttributeError: 'tuple' object has no attribute 'title'" while the
        # index was being built; they have to be converted to feed objects.
        return feeds_from_index(
            index,
            oldest_article=self.oldest_article,
            max_articles_per_feed=self.max_articles_per_feed,
            log=self.log,
        )

    def get_article_url(self, article):
        """Follow HTTP redirects and return the final URL for ``article``.

        Args:
            article: Mapping with a ``'url'`` entry.

        Returns:
            The URL reported by the response, or the original ``article['url']``
            when the request fails.
        """
        # NOTE(review): this only follows HTTP redirects. Google News article
        # links may need to be decoded another way to reach the publisher's
        # page -- confirm against a live feed.
        try:
            response = self.browser.open_novisit(article['url'])  # Use built-in browser tools
            try:
                return response.geturl()
            finally:
                response.close()
        except Exception as e:
            self.log(f'Error resolving URL {article["url"]}: {e}')
            return article['url']

    def fetch_article(self, url, *args, **kwargs):
        """Get the HTML content of the article.

        Called with only ``url``, downloads the page and returns it as a
        ``BeautifulSoup`` object, or ``None`` when the download fails.

        NOTE(review): ``BasicNewsRecipe`` appears to define its own
        ``fetch_article`` that the download machinery calls with additional
        positional arguments; shadowing it with a one-argument method would
        break that call.  Calls carrying extra arguments are therefore passed
        straight through to the base class -- confirm against the calibre
        version in use.
        """
        if args or kwargs:
            return super().fetch_article(url, *args, **kwargs)
        try:
            # Same browser object that get_article_url relies on.
            response = self.browser.open_novisit(url)
            try:
                # Do not let a page that is not valid UTF-8 abort the fetch.
                html = response.read().decode('utf-8', 'replace')
            finally:
                response.close()
            return BeautifulSoup(html, 'html.parser')
        except Exception as e:
            self.log(f'Error fetching article {url}: {e}')
            return None

    def preprocess_html(self, soup):
        """Extract the article body.

        Returns the first of ``<article>``, ``<div class="article-body">`` or
        ``<body>`` that exists; when none of them is present the soup is
        returned unchanged instead of ``None``.
        """
        content = (
            soup.find('article')
            or soup.find('div', class_='article-body')
            or soup.body
        )
        return content if content is not None else soup

# Last edited by theducks; 02-28-2025 at 06:44 AM. Reason: SPOILER LOG files

02-28-2025, 06:41 AM	#2
fengli Zealot Posts: 111 Karma: 39846 Join Date: Aug 2022 Device: PC	Error message: Spoiler: calibre, version 7.26.0 (win32, embedded-python: True) Conversion options changed from defaults: output_profile: 'generic_eink' verbose: 2 Resolved conversion options calibre version: 7.26.0 {'add_alt_text_to_img': False, 'asciiize': False, 'author_sort': None, 'authors': None, 'base_font_size': 0, 'book_producer': None, 'change_justification': 'original', 'chapter': None, 'chapter_mark': 'pagebreak', 'comments': None, 'cover': None, 'debug_pipeline': None, 'dehyphenate': True, 'delete_blank_paragraphs': True, 'disable_font_rescaling': False, 'dont_download_recipe': False, 'dont_split_on_page_breaks': True, 'duplicate_links_in_toc': False, 'embed_all_fonts': False, 'embed_font_family': None, 'enable_heuristics': False, 'epub_flatten': False, 'epub_inline_toc': False, 'epub_max_image_size': 'none', 'epub_toc_at_end': False, 'epub_version': '2', 'expand_css': False, 'extra_css': None, 'extract_to': None, 'filter_css': None, 'fix_indents': True, 'flow_size': 260, 'font_size_mapping': None, 'format_scene_breaks': True, 'html_unwrap_factor': 0.4, 'input_encoding': None, 'input_profile': <calibre.customize.profiles.InputProfile object at 0x0000025824561890>, 'insert_blank_line': False, 'insert_blank_line_size': 0.5, 'insert_metadata': False, 'isbn': None, 'italicize_common_cases': True, 'keep_ligatures': False, 'language': None, 'level1_toc': None, 'level2_toc': None, 'level3_toc': None, 'line_height': 0, 'linearize_tables': False, 'lrf': False, 'margin_bottom': 5.0, 'margin_left': 5.0, 'margin_right': 5.0, 'margin_top': 5.0, 'markup_chapter_headings': True, 'max_toc_links': 50, 'minimum_line_height': 120.0, 'no_chapters_in_toc': False, 'no_default_epub_cover': False, 'no_inline_navbars': False, 'no_svg_cover': False, 'output_profile': <calibre.customize.profiles.GenericEink object at 0x0000025824590E50>, 'page_breaks_before': None, 'prefer_metadata_cover': False, 
'preserve_cover_aspect_ratio': False, 'pretty_print': True, 'pubdate': None, 'publisher': None, 'rating': None, 'read_metadata_from_opf': None, 'recipe_specific_option': None, 'remove_fake_margins': True, 'remove_first_image': False, 'remove_paragraph_spacing': False, 'remove_paragraph_spacing_indent_size': 1.5, 'renumber_headings': True, 'replace_scene_breaks': '', 'search_replace': None, 'series': None, 'series_index': None, 'smarten_punctuation': False, 'sr1_replace': '', 'sr1_search': '', 'sr2_replace': '', 'sr2_search': '', 'sr3_replace': '', 'sr3_search': '', 'start_reading_at': None, 'subset_embedded_fonts': False, 'tags': None, 'test': False, 'timestamp': None, 'title': None, 'title_sort': None, 'toc_filter': None, 'toc_threshold': 6, 'toc_title': None, 'transform_css_rules': None, 'transform_html_rules': None, 'unsmarten_punctuation': False, 'unwrap_lines': True, 'use_auto_toc': False, 'verbose': 2} InputFormatPlugin: Recipe Input running Downloading recipe urn: custom:1276 Using user agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Using proxies: {'http': '127.0.0.1:7890', 'https': '127.0.0.1:7890', 'ftp': 'http://127.0.0.1:7890'} Synthesizing mastheadImage Traceback (most recent call last): File "runpy.py", line 198, in _run_module_as_main File "runpy.py", line 88, in _run_code File "site.py", line 83, in <module> File "site.py", line 78, in main File "site.py", line 50, in run_entry_point File "calibre\utils\ipc\worker.py", line 215, in main File "calibre\gui2\convert\gui_conversion.py", line 31, in gui_convert_recipe File "calibre\gui2\convert\gui_conversion.py", line 25, in gui_convert File "calibre\ebooks\conversion\plumber.py", line 1128, in run File "calibre\customize\conversion.py", line 242, in __call__ File "calibre\ebooks\conversion\plugins\recipe_input.py ", line 153, in convert File "calibre\web\feeds\news.py", line 1121, in download File "calibre\web\feeds\news.py", line 
1326, in build_index File "calibre\web\feeds\news.py", line 1158, in feeds2index File "calibre\web\feeds\templates.py", line 51, in generate File "calibre\web\feeds\templates.py", line 105, in _generate AttributeError: 'tuple' object has no attribute 'title' Last edited by theducks; 02-28-2025 at 06:45 AM. Reason: SPOILER LOG files

Advert

04-06-2025, 07:26 AM	#3
fengli Zealot Posts: 111 Karma: 39846 Join Date: Aug 2022 Device: PC	Can anyone help me? Thanks very much.

04-06-2025, 09:10 AM	#4
unkn0wn Guru Posts: 649 Karma: 85520 Join Date: May 2021 Device: kindle	You must figure out how the Google feed URL maps to the actual article URL, and then replicate that in the recipe code.

04-11-2025, 10:02 AM	#5
fengli Zealot Posts: 111 Karma: 39846 Join Date: Aug 2022 Device: PC	I don't know how to code. You helped me make one before, but it doesn't work now. Can you help me fix it? Thank you very much. Please see my latest reply post. Thank you

Advert

Reply

« Previous Thread | Next Thread »

Forum Jump

Similar Threads
Thread	Thread Starter	Forum	Replies	Last Post
Focus (DE) Crawl failed, please take a look, thank you	fengli	Recipes	2	08-14-2023 08:36 PM
Bloomberg Weekly also failed, unable to crawl content, full of invalid icons	fengli	Recipes	12	07-19-2023 02:25 AM
PC word Crawl failed	fengli	Recipes	4	01-06-2023 03:08 AM
On Feedbooks: viewing full news article through RSS feeds	edercito	Amazon Kindle	7	07-24-2009 02:23 AM
Missing features: Gutenberg, Google Books, Google News, open RSS aggregator	Charbax	Amazon Kindle	10	11-22-2007 08:22 PM

All times are GMT -4. The time now is 03:25 AM.