View Single Post
Old 01-21-2010, 02:18 PM   #1223
nickredding
onlinenewsreader.net
nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'
 
Posts: 328
Karma: 10143
Join Date: Dec 2009
Location: Phoenix, AZ & Victoria, BC
Device: Kindle 3, Kindle Fire, IPad3, iPhone4, Playbook, HTC Inspire
re: Problem with Wall Street Journal (free) recipe

I should know better than to post in forums before I've finished my coffee. I was coding the solution to date locales when Kovid posted his suggestion. Here is the fixed recipe, which manually decodes the WSJ US locale dateline for comparison. evanmaastrigt — if you could test this in your locale I'd appreciate it.

Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'

'''
online.wsj.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
from datetime import timedelta, datetime, date

class WSJ(BasicNewsRecipe):
    # formatting adapted from original recipe by Kovid Goyal and Sujata Raman
    """Calibre recipe that fetches the free articles from online.wsj.com.

    Works by scraping the section index pages listed in ``sectionlist``,
    filtering out paid ("Subscriber Content") and stale articles, and moving
    each article's dateline into its byline heading.
    """
    title          = u'Wall Street Journal (free)'
    __author__     = 'Nick Redding'
    language = 'en'
    description = ('All the free content from the Wall Street Journal (business, financial and political news)')
 
    no_stylesheets = True
    timefmt = ' [%b %d]'  # date suffix appended to the periodical title

    # customization notes: delete sections you are not interested in
    # set omit_paid_content to False if you want the paid content article snippets
    # set oldest_article to the maximum number of days back from today to include articles
    # Each entry is [site-relative index-page URL, section title shown in the e-book].
    sectionlist = [
                        ['/home-page','Front Page'],
                        ['/public/page/news-opinion-commentary.html','Commentary'],
                        ['/public/page/news-global-world.html','World News'],
                        ['/public/page/news-world-business.html','US News'],
                        ['/public/page/news-business-us.html','Business'],
                        ['/public/page/news-financial-markets-stock.html','Markets'],
                        ['/public/page/news-tech-technology.html','Technology'],
                        ['/public/page/news-personal-finance.html','Personal Finnce'],  # NOTE(review): 'Finnce' looks like a typo for 'Finance'; left as-is because it is the user-visible section title
                        ['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
                        ['/public/page/news-real-estate-homes.html','Real Estate'],
                        ['/public/page/news-career-jobs.html','Careers'],
                        ['/public/page/news-small-business-marketing.html','Small Business']
                    ]
    oldest_article = 2          # skip articles older than this many days
    omit_paid_content = True    # drop subscriber-only article snippets entirely
    
    extra_css   = '''h1{font-size:large; font-family:Times,serif;}
                    h2{font-family:Times,serif; font-size:small; font-style:italic;}
                    .subhead{font-family:Times,serif; font-size:small; font-style:italic;}
                    .insettipUnit {font-family:Times,serif;font-size:xx-small;}
                    .targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
                    .article{font-family:Times,serif; font-size:x-small;}
                    .tagline { font-size:xx-small;}
                    .dateStamp {font-family:Times,serif;}
                    h3{font-family:Times,serif; font-size:xx-small;}
                    .byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
                    .metadataType-articleCredits {list-style-type: none;}
                    h6{font-family:Times,serif; font-size:small; font-style:italic;}
                    .paperLocation{font-size:xx-small;}'''


    # strip everything before the headline box and all the page chrome
    # (footers, video/slideshow widgets, ads, related-article insets, ...)
    remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')})
    remove_tags =   [   dict({'id':re.compile('^articleTabs_tab_')}),
                        #dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
                        #         "articleTabs_tab_interactive","articleTabs_tab_video",
                        #         "articleTabs_tab_map","articleTabs_tab_slideshow"]),
			{'class':  ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
                                    'insettip','insetClose','more_in', "insetContent",
                        #            'articleTools_bottom','articleTools_bottom mjArticleTools',
                                    'aTools', 'tooltip', 
                                    'adSummary', 'nav-inline','insetFullBracket']},
                        dict({'class':re.compile('^articleTools_bottom')}),
                        dict(rel='shortcut icon')
                    ]
    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]
    
    def get_browser(self):
        """Return the stock calibre browser; no login is needed for free content."""
        br = BasicNewsRecipe.get_browser()
        return br


    def preprocess_html(self,soup):
        """Filter and clean one article page.

        Returns None (article skipped) when the article is subscriber-only
        content, older than ``oldest_article`` days, or has no dateline at
        all; otherwise returns the soup with the dateline merged into the
        byline heading and stray bullet markup removed.
        """

        def decode_us_date(datestr):
            # Parse a US-locale dateline like "January 21 2010" into a date.
            # Raises ValueError if the first word is not an English month
            # name, so this only works for the WSJ US-locale dateline.
            udate = datestr.strip().lower().split()
            m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(udate[0])+1
            d = int(udate[1])
            y = int(udate[2])
            return date(y,m,d)
        
        # check if article is paid content
        if self.omit_paid_content:
            divtags = soup.findAll('div','tooltip')
            if divtags:
                for divtag in divtags:
                    if divtag.find(text="Subscriber Content"):
                        return None
                    
        # check if article is too old
        datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
        if datetag:
            dateline_string = self.tag_to_string(datetag,False)
            # dateline looks like "JANUARY 21, 2010, 2:18 P.M."; splitting on
            # ',' and rejoining items 0 and 1 yields "JANUARY 21 2010" (no
            # comma), which is what decode_us_date expects
            date_items = dateline_string.split(',')
            datestring = date_items[0]+date_items[1]
            article_date = decode_us_date(datestring)
            earliest_date = date.today() - timedelta(days=self.oldest_article)
            if article_date < earliest_date:
                self.log("Skipping article dated %s" % datestring)
                return None
            datetag.parent.extract()

            # place dateline in article heading
            
            # the byline may be an <h3 class="byline"> or an <li class="byline">
            # (possibly wrapping an h3); normalize to h3bylinetag = text source,
            # bylinetag = tag to replace
            bylinetag = soup.find('h3','byline')
            if bylinetag:
                h3bylinetag = bylinetag
            else:
                bylinetag = soup.find('li','byline')
                if bylinetag:
                    h3bylinetag = bylinetag.h3
                    if not h3bylinetag:
                        h3bylinetag = bylinetag
                    bylinetag = bylinetag.parent
            if bylinetag:
                if h3bylinetag.a:
                    bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
                else:
                    bylinetext = self.tag_to_string(h3bylinetag,False)
                # rebuild the byline as a single h3 that also carries the date
                h3byline = Tag(soup,'h3',[('class','byline')])
                if bylinetext.isspace() or (bylinetext == ''):
                    h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
                else:
                    # u'\u2014' is an em dash between author and date
                    h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
                bylinetag.replaceWith(h3byline)
            else:                  
                # no byline at all: append a date-only heading to the headline box
                headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
                if headlinetag:
                    dateline = Tag(soup,'h3', [('class','byline')])
                    dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
                    headlinetag.insert(len(headlinetag),dateline)
        else: # if no date tag, don't process this page--it's not a news item
            return None
        # This gets rid of the annoying superfluous bullet symbol preceding columnist bylines
        ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
        if ultag:
            a = ultag.h3
            if a:
                ultag.replaceWith(a)
        return soup

    def parse_index(self):
        """Build the feed structure: scrape every page in ``sectionlist`` and
        return a list of (section title, article-dict list) tuples."""

        articles = {}   # page_title -> list of article dicts
        key = None      # NOTE(review): appears unused; 'key' is rebound by the comprehension below
        ans = []        # ordered list of section titles, later paired with articles

        def parse_index_page(page_name,page_title):
            """Scrape one section index page and append its articles to
            ``articles[page_title]`` (closure over ``articles``)."""

            def article_title(tag):
                atag = tag.find('h2') # title is usually in an h2 tag
                if not atag: # if not, get text from the a tag
                    atag = tag.find('a',href=True)
                    if not atag:
                        return ''
                    t = self.tag_to_string(atag,False)
                    if t == '':
                        # sometimes the title is in the second a tag
                        atag.extract()
                        atag = tag.find('a',href=True)
                        if not atag:
                            return ''
                        return self.tag_to_string(atag,False)
                    return t
                return self.tag_to_string(atag,False)

            def article_author(tag):
                atag = tag.find('strong') # author is usually in a strong tag
                if not atag:
                     atag = tag.find('h4') # if not, look for an h4 tag
                     if not atag:
                         return ''
                return self.tag_to_string(atag,False)

            def article_summary(tag):
                # first <p>, with any leading <strong> (author credit) removed
                atag = tag.find('p')
                if not atag:
                    return ''
                subtag = atag.strong
                if subtag:
                    subtag.extract()
                return self.tag_to_string(atag,False)

            def article_url(tag):
                # href of the first link, with any query string stripped
                atag = tag.find('a',href=True)
                if not atag:
                    return ''
                url = re.sub(r'\?.*', '', atag['href'])
                return url

            def handle_section_name(tag):
                # turns a tag into a section name with special processing
                # for What's News, U.S., World & U.S. and World
                s = self.tag_to_string(tag,False)
                if ("What" in s) and ("News" in s):
                    s = "What's News"
                elif (s == "U.S.") or (s == "World & U.S.") or (s == "World"):
                    s = s + " News"
                return s

                

            mainurl = 'http://online.wsj.com'
            pageurl = mainurl+page_name
            #self.log("Page url %s" % pageurl)
            soup = self.index_to_soup(pageurl)
            # Find each instance of div with class including "headlineSummary"
            for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
                # divtag contains all article data as ul's and li's
                # first, check if there is an h3 tag which provides a section name
                stag = divtag.find('h3')
                if stag:
                    if stag.parent['class'] == 'dynamic':
                        # a carousel of articles is too complex to extract a section name
                        # for each article, so we'll just call the section "Carousel"
                        section_name = 'Carousel'
                    else:
                        section_name = handle_section_name(stag)
                else:
                    section_name = "What's News"
                #self.log("div Section %s" % section_name)
                # find each top-level ul in the div
                # we don't restrict to class = newsItem because the section_name
                # sometimes changes via a ul tag inside the div
                for ultag in divtag.findAll('ul',recursive=False):
                    stag = ultag.find('h3')
                    if stag:
                        if stag.parent.name == 'ul':
                            # section name has changed
                            section_name = handle_section_name(stag)
                            #self.log("ul Section %s" % section_name)
                            # delete the h3 tag so it doesn't get in the way
                            stag.extract()
                    # find each top level li in the ul
                    for litag in ultag.findAll('li',recursive=False):
                        stag = litag.find('h3')
                        if stag:
                            # section name has changed
                            section_name = handle_section_name(stag)
                            #self.log("li Section %s" % section_name)
                            # delete the h3 tag so it doesn't get in the way
                            stag.extract()
                        # if there is a ul tag inside the li it is superfluous;
                        # it is probably a list of related articles
                        utag = litag.find('ul')
                        if utag:
                            utag.extract()
                        # now skip paid subscriber articles if desired
                        subscriber_tag = litag.find(text="Subscriber Content")
                        if subscriber_tag:
                                if self.omit_paid_content:
                                    continue             
                                # delete the tip div so it doesn't get in the way
                                tiptag = litag.find("div", { "class" : "tipTargetBox" })
                                if tiptag:
                                    tiptag.extract()
                        h1tag = litag.h1
                        # if there's an h1 tag, its parent is a div which should replace
                        # the li tag for the analysis
                        if h1tag:
                            litag = h1tag.parent                  
                        h5tag = litag.h5
                        if h5tag:
                            # section name has changed
                            section_name = self.tag_to_string(h5tag,False)
                            #self.log("h5 Section %s" % section_name)
                            # delete the h5 tag so it doesn't get in the way
                            h5tag.extract()
                        url = article_url(litag)
                        if url == '':
                            continue
                        if url.startswith("/article"):
                            url = mainurl+url
                        # keep only regular WSJ article pages (no video, no off-site links)
                        if not url.startswith("http://online.wsj.com"):
                            continue
                        if not url.endswith(".html"):
                            continue
                        if 'video' in url:
                            continue
                        title = article_title(litag)
                        if title == '':
                            continue
                        #self.log("URL %s" % url)
                        #self.log("Title %s" % title)
                        pubdate = ''
                        #self.log("Date %s" % pubdate)
                        # fold the section name into the author field so it shows
                        # up in the article listing; avoid "Section: Section"
                        author = article_author(litag)
                        if author == '':
                            author = section_name
                        elif author == section_name:
                            author = ''
                        else:
                            author = section_name+': '+author
                        #if not author == '':
                        #    self.log("Author %s" % author)
                        description = article_summary(litag)
                        #if not description == '':
                        #    self.log("Description %s" % description)
                        # has_key: this recipe targets Python 2 (old calibre)
                        if not articles.has_key(page_title):
                            articles[page_title] = []
                        articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))

    
        for page_name,page_title in self.sectionlist:
            parse_index_page(page_name,page_title)
            ans.append(page_title)

        # keep sectionlist order, dropping sections that yielded no articles
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans
nickredding is offline