![]() |
#1 | |
Member
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 21
Karma: 1010
Join Date: Dec 2011
Device: sony reader PRS-T1, kobo touch
|
ars technica recipe
I am trying to add these feeds to the Ars Technica recipe:
Code:
(u'Ars Features (All our long-form feature articles)' , u'http://feeds.arstechnica.com/arstechnica/features') (u'Technology Lab (Information Technology)' , u'http://feeds.arstechnica.com/arstechnica/technology-lab/') (u'Ministry of Innovation (Business of Technology)' , u'http://feeds.arstechnica.com/arstechnica/business') (u'Staff Blogs (From the Minds of Ars)' , u'http://feeds.arstechnica.com/arstechnica/staff-blogs') Quote:
Complete code listing with my modifications: Code:
__license__ = 'GPL v3'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
arstechnica.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class ArsTechnica(BasicNewsRecipe):
    """calibre news recipe for the Ars Technica RSS feeds.

    Articles are reduced to the ``standalone`` / ``article-guts`` elements,
    multi-page articles are stitched together by ``append_page`` and links
    are flattened to plain text in ``preprocess_html``.
    """

    # Recipe metadata.
    title = u'Ars Technica'
    language = 'en'
    __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou,tom sparks'
    description = 'Ars Technica: Serving the technologist for 1.2 decades'
    publisher = 'Conde Nast Publications'
    category = 'news, IT, technology'

    # Download settings.
    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    publication_type = 'newsportal'

    # Stylesheet applied to the downloaded articles.
    extra_css = '''
        body {font-family: Arial,sans-serif}
        .heading{font-family: "Times New Roman",serif}
        .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
        img{display: block}
        .caption-text{font-size:small; font-style:italic}
        .caption-byline{font-size:small; font-style:italic; font-weight:bold}
    '''

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Only these elements of each article page are kept.
    keep_only_tags = [
        dict(attrs={'class': 'standalone'}),
        dict(attrs={'id': 'article-guts'}),
    ]

    # Elements stripped from what remains.
    remove_tags = [
        dict(name=['object', 'link', 'embed', 'iframe', 'meta']),
        dict(attrs={'class': 'corner-info'}),
    ]
    remove_attributes = ['lang']

    # (title, URL) pairs.  Every entry must be followed by a comma: two
    # adjacent tuples without one are parsed as a call on the first tuple
    # and raise "TypeError: 'tuple' object is not callable".
    feeds = [
        (u'Ars Features (All our long-form feature articles)', u'http://feeds.arstechnica.com/arstechnica/features'),
        (u'Technology Lab (Information Technology)', u'http://feeds.arstechnica.com/arstechnica/technology-lab/'),
        (u'Ministry of Innovation (Business of Technology)', u'http://feeds.arstechnica.com/arstechnica/business'),
        (u'Staff Blogs (From the Minds of Ars)', u'http://feeds.arstechnica.com/arstechnica/staff-blogs'),
        (u'Infinite Loop (Apple content)', u'http://feeds.arstechnica.com/arstechnica/apple/'),
        (u'Opposable Thumbs (Gaming content)', u'http://feeds.arstechnica.com/arstechnica/gaming/'),
        (u'Gear and Gadgets', u'http://feeds.arstechnica.com/arstechnica/gadgets/'),
        (u'Uptime (IT content)', u'http://feeds.arstechnica.com/arstechnica/business/'),
        (u'Open Ended (Open Source content)', u'http://feeds.arstechnica.com/arstechnica/open-source/'),
        (u'One Microsoft Way', u'http://feeds.arstechnica.com/arstechnica/microsoft/'),
        (u'Scientific method (Science content)', u'http://feeds.arstechnica.com/arstechnica/science/'),
        (u'Law & Disorder (Tech policy content)', u'http://feeds.arstechnica.com/arstechnica/tech-policy/'),
        (u'Risk Assessment (Security content)', u'http://feeds.arstechnica.com/arstechnica/security/'),
    ]

    def append_page(self, soup, appendtag, position):
        """Pull in the following pages of a multi-page article.

        Looks for a ``numbers`` pager with a ``next`` entry in *soup*,
        downloads the linked page, recursively appends that page's own
        successors to its ``article-guts`` element, and finally inserts that
        element into *appendtag* at index *position*.  The pager is removed
        from *soup*.  Does nothing when there is no pager or no next link.
        """
        pager = soup.find(attrs={'class': 'numbers'})
        if not pager:
            return
        nexttag = pager.find(attrs={'class': 'next'})
        if not nexttag:
            return

        nurl = nexttag.parent['href']
        # Second argument presumably requests the raw markup rather than a
        # parsed soup - confirm against BasicNewsRecipe.index_to_soup.
        rawc = self.index_to_soup(nurl, True)
        soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
        pager.extract()

        texttag = soup2.find(attrs={'id': 'article-guts'})
        if texttag is None:
            # The next page has no article body; nothing to append.
            return
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Stitch in continuation pages, flatten links and fill in alt text.

        Returns the same *soup* object after modifying it in place.
        """
        self.append_page(soup, soup.body, 3)

        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                # Text-only link: keep just the text.
                item.replaceWith(item.string)
            elif limg:
                # Link wrapping an image: keep the image in a plain <div>.
                item.name = 'div'
                item.attrs = []
            else:
                item.replaceWith(self.tag_to_string(item))

        for item in soup.findAll('img'):
            # `'alt' in item` would test the tag's children, not its
            # attributes, so look the attribute up explicitly.
            if item.get('alt') is None:
                item['alt'] = 'image'
        return soup

    def preprocess_raw_html(self, raw, url):
        """Replace everything before ``</head>`` with an empty ``<html><head>``.

        When *raw* contains no ``</head>`` it is returned unchanged instead
        of being cut down to its last character (``find`` returns -1).
        """
        head_end = raw.find('</head>')
        if head_end < 0:
            return raw
        return '<html><head>' + raw[head_end:]

# Forum footer: Last edited by tom_a_sparks; 05-21-2014 at 10:11 PM. Reason: spelling
|
![]() |
![]() |
![]() |
#2 |
Member
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 21
Karma: 1010
Join Date: Dec 2011
Device: sony reader PRS-T1, kobo touch
|
Stupid code mistake fixed
![]() Please add to the next calibre update ![]() Code:
__license__ = 'GPL v3'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
arstechnica.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class ArsTechnica(BasicNewsRecipe):
    """calibre news recipe for the Ars Technica RSS feeds.

    Articles are reduced to the ``standalone`` / ``article-guts`` elements,
    multi-page articles are stitched together by ``append_page`` and links
    are flattened to plain text in ``preprocess_html``.
    """

    # Recipe metadata.
    title = u'Ars Technica'
    language = 'en'
    __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou, Tom Sparks'
    description = 'Ars Technica: Serving the technologist for 1.2 decades'
    publisher = 'Conde Nast Publications'
    category = 'news, IT, technology'

    # Download settings.
    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    publication_type = 'newsportal'

    # Stylesheet applied to the downloaded articles.
    extra_css = '''
        body {font-family: Arial,sans-serif}
        .heading{font-family: "Times New Roman",serif}
        .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
        img{display: block}
        .caption-text{font-size:small; font-style:italic}
        .caption-byline{font-size:small; font-style:italic; font-weight:bold}
    '''

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Only these elements of each article page are kept.
    keep_only_tags = [
        dict(attrs={'class': 'standalone'}),
        dict(attrs={'id': 'article-guts'}),
    ]

    # Elements stripped from what remains.
    remove_tags = [
        dict(name=['object', 'link', 'embed', 'iframe', 'meta']),
        dict(attrs={'class': 'corner-info'}),
    ]
    remove_attributes = ['lang']

    # (title, URL) pairs; the trailing comma after every entry keeps later
    # additions from accidentally fusing two tuples into a call expression.
    feeds = [
        (u'Ars Features (All our long-form feature articles)', u'http://feeds.arstechnica.com/arstechnica/features'),
        (u'Technology Lab (Information Technology)', u'http://feeds.arstechnica.com/arstechnica/technology-lab/'),
        (u'Ministry of Innovation (Business of Technology)', u'http://feeds.arstechnica.com/arstechnica/business'),
        (u'Staff Blogs (From the Minds of Ars)', u'http://feeds.arstechnica.com/arstechnica/staff-blogs'),
        (u'Infinite Loop (Apple content)', u'http://feeds.arstechnica.com/arstechnica/apple/'),
        (u'Opposable Thumbs (Gaming content)', u'http://feeds.arstechnica.com/arstechnica/gaming/'),
        (u'Gear and Gadgets', u'http://feeds.arstechnica.com/arstechnica/gadgets/'),
        (u'Uptime (IT content)', u'http://feeds.arstechnica.com/arstechnica/business/'),
        (u'Open Ended (Open Source content)', u'http://feeds.arstechnica.com/arstechnica/open-source/'),
        (u'One Microsoft Way', u'http://feeds.arstechnica.com/arstechnica/microsoft/'),
        (u'Scientific method (Science content)', u'http://feeds.arstechnica.com/arstechnica/science/'),
        (u'Law & Disorder (Tech policy content)', u'http://feeds.arstechnica.com/arstechnica/tech-policy/'),
        (u'Risk Assessment (Security content)', u'http://feeds.arstechnica.com/arstechnica/security/'),
    ]

    def append_page(self, soup, appendtag, position):
        """Pull in the following pages of a multi-page article.

        Looks for a ``numbers`` pager with a ``next`` entry in *soup*,
        downloads the linked page, recursively appends that page's own
        successors to its ``article-guts`` element, and finally inserts that
        element into *appendtag* at index *position*.  The pager is removed
        from *soup*.  Does nothing when there is no pager or no next link.
        """
        pager = soup.find(attrs={'class': 'numbers'})
        if not pager:
            return
        nexttag = pager.find(attrs={'class': 'next'})
        if not nexttag:
            return

        nurl = nexttag.parent['href']
        # Second argument presumably requests the raw markup rather than a
        # parsed soup - confirm against BasicNewsRecipe.index_to_soup.
        rawc = self.index_to_soup(nurl, True)
        soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
        pager.extract()

        texttag = soup2.find(attrs={'id': 'article-guts'})
        if texttag is None:
            # The next page has no article body; nothing to append.
            return
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Stitch in continuation pages, flatten links and fill in alt text.

        Returns the same *soup* object after modifying it in place.
        """
        self.append_page(soup, soup.body, 3)

        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                # Text-only link: keep just the text.
                item.replaceWith(item.string)
            elif limg:
                # Link wrapping an image: keep the image in a plain <div>.
                item.name = 'div'
                item.attrs = []
            else:
                item.replaceWith(self.tag_to_string(item))

        for item in soup.findAll('img'):
            # `'alt' in item` would test the tag's children, not its
            # attributes, so look the attribute up explicitly.
            if item.get('alt') is None:
                item['alt'] = 'image'
        return soup

    def preprocess_raw_html(self, raw, url):
        """Replace everything before ``</head>`` with an empty ``<html><head>``.

        When *raw* contains no ``</head>`` it is returned unchanged instead
        of being cut down to its last character (``find`` returns -1).
        """
        head_end = raw.find('</head>')
        if head_end < 0:
            return raw
        return '<html><head>' + raw[head_end:]
![]() |
![]() |
Advert | |
|
![]() |
Tags |
ars technica recipe |
|
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
Updated Ars Technica recipe | odie | Recipes | 2 | 05-15-2012 12:52 PM |
Ars Technica recipe update | Alexis | Recipes | 2 | 08-04-2011 05:40 PM |
Ars Technica issues | cypherslock | Calibre | 2 | 01-24-2010 07:27 PM |
Ars.Technica review of the iliad | Antartica | iRex | 3 | 02-19-2008 11:02 AM |
Sony Reader gets a 6/10 from Ars Technica | Alexander Turcic | Sony Reader | 32 | 11-17-2007 06:36 PM |