Quote:
Originally Posted by bhandarisaurabh
|
This should work for the
CURRENT ARTICLE MONTH/YEAR
The page has a form that lets you select a different year, but I'm not sure what actual URLs it uses for that, so I just stuck with the current month/year since I figured that is what you would want anyway. Note that even though September 2010 is selected on the page, the article content still says August 18 or whatever; that is the same date that is shown on the original page.
Anyway, the only thing I don't yet know how to do is make the description drop the text that is inside the <a> tag. Once that is done I will post an update.
Updated Code to do descr correctly
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
class IW(BasicNewsRecipe):
    """Calibre news recipe for the Industry Week archive page.

    The article list is scraped from the archive index page (see
    ``parse_index`` / ``make_links``) and every article URL is rewritten
    to the site's ``PrintArticle.aspx`` page (see ``print_version``).
    """

    title = 'Industry Week'
    __author__ = 'Tonythebookworm'
    description = ''
    language = 'en'
    publisher = 'Tonythebookworm'
    category = 'Manufactoring'
    use_embedded_content = False
    no_stylesheets = True
    oldest_article = 40
    remove_javascript = True
    remove_empty_feeds = True
    max_articles_per_feed = 200  # only gets the first 200 articles
    # Site root; the hrefs scraped from the index page are appended to it.
    INDEX = 'http://www.industryweek.com'
    remove_tags = [
        dict(name='div', attrs={'class': ['crumbNav']}),
        dict(name='i'),
    ]

    def parse_index(self):
        """Build the feed list from the archive index page(s).

        Returns:
            A list of ``(feed title, articles)`` tuples, where ``articles``
            is the list produced by ``make_links``. Index pages that yield
            no articles are left out.
        """
        feeds = []
        for title, url in [
            (u"Current Month", u"http://www.industryweek.com/Archive.aspx"),
        ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect article entries from the index page at *url*.

        Every ``<a class="article">`` element that carries an ``href``
        becomes one entry; anchors without an ``href`` are skipped.

        Args:
            url: Address of the index page to scrape.

        Returns:
            A list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (``date`` is always ``''``).
        """
        current_articles = []
        # Parse the index page once, outside the loop; nothing inside the
        # loop needs a fresh copy of it.
        soup = self.index_to_soup(url)
        for item in soup.findAll('a', attrs={'class': 'article'}):
            # .get() instead of ['href'] so an anchor lacking the attribute
            # is skipped rather than raising KeyError.
            link = item.get('href')
            if not link:
                continue
            article_url = self.INDEX + link
            title = self.tag_to_string(item)
            # The description is the text of the anchor's parent element
            # once the anchor itself has been removed, so the title text is
            # not repeated inside the description.
            container = item.parent
            item.extract()
            descr = self.tag_to_string(container)
            current_articles.append({
                'title': title,
                'url': article_url,
                'description': descr,
                'date': '',
            })
        return current_articles

    def print_version(self, url):
        """Return the ``PrintArticle.aspx`` URL for the article at *url*.

        The text between the first and second ``=`` of *url* is used as
        the ``ArticleID`` value.

        Args:
            url: Article URL as stored by ``make_links``.

        Returns:
            The print-page URL, or *url* unchanged when it contains no
            ``=`` and therefore no ID can be taken from it.
        """
        parts = url.split("=")
        if len(parts) < 2:
            # No '=' in the URL: fall back to the regular page instead of
            # raising IndexError.
            return url
        return ('http://www.industryweek.com/PrintArticle.aspx?ArticleID='
                + parts[1])