View Single Post
Old 07-06-2015, 10:25 PM   #1
anisotrope
Junior Member
anisotrope began at the beginning.
 
Posts: 5
Karma: 14
Join Date: Jul 2015
Device: none
Question Physics today recipe help

Hello,

I'm trying to create a recipe for Physics Today. While the articles are behind a paywall, the index is not. I'm having trouble getting Beautiful Soup to correctly parse the HTML from the page.

The link that I'm using is: http://scitation.aip.org/content/aip...ize=100&page=1

The recipe that I'm trying to test is below. The problem is that the "div" class="publistwrapper contain" does not show up in the beautiful soup version, it is all cleaned out. Which means that none of the index is actually saved.

How can I actually get all of the HTML to show up in the soup so I can parse it correctly?

Thanks!

Code:
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime

class Physicstoday(BasicNewsRecipe):
    """Calibre recipe for Physics Today (subscription required).

    Logs in via the scitation.aip.org sign-in form, builds the table of
    contents from the public issue-index page, and signs out when done.
    """

    title                 = u'Physics Today (Subscription)'
    __author__            = 'anisotrope'
    description           = u'Physics Today magazine'
    publisher             = 'American Institute of Physics'
    category              = 'Physics'
    language              = 'en'
    oldest_article        = 30
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    needs_subscription    = True
    remove_javascript     = True
    remove_tags_before = dict(name='div', attrs={'class': 'magazineDescriptionContainer'})
    remove_tags_after  = dict(name='div', attrs={'class': 'content'})
    remove_tags = [
        dict(name='div', attrs={'class': 'clear articlenav std-display'}),
        dict(name='div', attrs={'class': 'pubtopright'}),
        dict(name='div', attrs={'id': 'commentsSection'}),
    ]

    # Issue-index page (publicly reachable even without a subscription).
    FRONTPAGE = "https://scitation.aip.org/content/aip/magazine/physicstoday/issues?pageSize=100&page=1"
    # Site root WITHOUT a trailing slash: every use below concatenates a
    # path that starts with '/', so a trailing slash would produce '//'.
    INDEX = "https://scitation.aip.org"

    def get_browser(self):
        """Return a browser logged in with the user's credentials.

        Fills the 'signinform' form on the front page when both username
        and password have been configured; otherwise browses anonymously.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.FRONTPAGE)
            br.select_form(name='signinform')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def cleanup(self):
        """Sign out so the subscription session is not left open."""
        self.browser.open(self.INDEX + "/session/sign-out")

    def get_cover_url(self):
        """Return the cover-image URL scraped from the issue-index page."""
        soup = self.index_to_soup(self.FRONTPAGE)
        div = soup.find('div', attrs={'id': 'coverImagePdf'})
        # NOTE(review): this takes the div's *text* as the URL path —
        # confirm the site really puts the path in the text, not in an
        # <img src> / <a href> attribute.
        img_url = self.INDEX + self.tag_to_string(div)
        print("Cover url: {}".format(img_url))
        return img_url  # the path already carries everything after the host

    def parse_index(self):
        """Build the issue's table of contents.

        Returns a list of (section_title, [article_dict, ...]) tuples as
        required by BasicNewsRecipe.parse_index().
        """
        answer = []
        soup = self.index_to_soup(self.FRONTPAGE)

        # Issue title looks like e.g. 'Volume 68, Issue 7, July 2015';
        # the third comma-separated field is the human-readable date.
        date_element = soup.find('div', attrs={"class": "issueTitle"})
        print("Date_element: {}".format(date_element))
        date = re.split(r',\s', self.tag_to_string(date_element))[2]
        self.title = "Physics Today ({})".format(date)
        self.timefmt = u' [%s]' % date

        sec_start = soup.findAll('ul', attrs={"class": re.compile('issueTocShowhide')})
        print("Sec_start: {}".format(sec_start))
        for sec in sec_start:
            articles = []
            section = self.tag_to_string(
                sec.find('li', attrs={"class": "issueTocShowhide"}).span)
            print("Section: " + section)
            for div_block in sec.findAll('div', attrs={"class": "articleInToc"}):
                h5 = div_block.find('h5')
                if h5 is None:
                    continue
                title = self.tag_to_string(h5)
                article_url = self.INDEX + h5.span.a['href']
                print("Article url: {}".format(article_url))
                atr = div_block.findNext('span', attrs={'class': "meta-value authors"})
                author = self.tag_to_string(atr) if atr is not None else ''
                # NOTE(review): 'derscription contain' looks like a typo for
                # 'description contain' — verify against the live page HTML.
                desc = div_block.findNext('div', attrs={'class': 'derscription contain'})
                description = self.tag_to_string(desc.p) if desc is not None else ''
                # Fix: the original appended an undefined name 'url'
                # (its only assignment was commented out), which raised
                # NameError on the first article; use article_url.
                articles.append({'title': title, 'date': None,
                                 'url': article_url,
                                 'description': description,
                                 'author': author})
            if articles:
                answer.append((section, articles))
        return answer
anisotrope is offline   Reply With Quote