Hello,
I'm trying to create a recipe for Physics Today. While the articles are behind a paywall, the index is not. I'm having trouble getting Beautiful Soup to correctly parse the HTML from the page.
The link that I'm using is:
http://scitation.aip.org/content/aip...ize=100&page=1
The recipe that I'm trying to test is below. The problem is that the "div" class="publistwrapper contain" does not show up in the beautiful soup version, it is all cleaned out. Which means that none of the index is actually saved.

How can I actually get all of the HTML to show up in the soup so I can parse it correctly?
Thanks!
Code:
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
class Physicstoday(BasicNewsRecipe):
    """Calibre recipe for the Physics Today magazine (subscription required).

    Logs in through the scitation.aip.org sign-in form, scrapes the issue
    table of contents from FRONTPAGE, and builds the section/article index
    that calibre's downloader consumes.
    """
    title = u'Physics Today (Subscription)'
    __author__ = 'anisotrope'
    description = u'Physics Today magazine'
    publisher = 'American Institute of Physics'
    category = 'Physics'
    language = 'en'
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    needs_subscription = True
    remove_javascript = True
    remove_tags_before = dict(name='div', attrs={'class': 'magazineDescriptionContainer'})
    remove_tags_after = dict(name='div', attrs={'class': 'content'})
    remove_tags = [
        dict(name='div', attrs={'class': 'clear articlenav std-display'}),
        dict(name='div', attrs={'class': 'pubtopright'}),
        dict(name='div', attrs={'id': 'commentsSection'}),
    ]
    # Issue index page (public) and site root used to absolutize links.
    FRONTPAGE = "https://scitation.aip.org/content/aip/magazine/physicstoday/issues?pageSize=100&page=1"
    INDEX = "https://scitation.aip.org/"

    def get_browser(self):
        """Return a browser, signed in via the front page's login form
        when username/password were supplied."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.FRONTPAGE)
            br.select_form(name='signinform')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def cleanup(self):
        """End the authenticated session once downloading is done."""
        # NOTE(review): INDEX already ends in '/', so this produces a double
        # slash ('...org//session/sign-out'); most servers tolerate it.
        self.browser.open(self.INDEX + "/session/sign-out")

    def get_cover_url(self):
        """Return the cover image URL read from the issue index page."""
        soup = self.index_to_soup(self.FRONTPAGE)
        div = soup.find('div', attrs={'id': 'coverImagePdf'})
        # The div's text is assumed to hold the path to the cover image.
        img_url = self.INDEX + self.tag_to_string(div)
        return img_url  # The url includes the https:// as necessary

    def parse_index(self):
        """Build the [(section_title, [article_dict, ...]), ...] structure
        that calibre expects from the issue's table of contents."""
        answer = []
        soup = self.index_to_soup(self.FRONTPAGE)
        # Issue date: third comma-separated field of the issue title,
        # e.g. "Volume 68, Issue 1, January 2015" -> "January 2015".
        date_element = soup.find('div', attrs={"class": "issueTitle"})
        date = re.split(r',\s', self.tag_to_string(date_element))[2]
        self.title = "Physics Today ({})".format(date)
        self.timefmt = u' [%s]' % date
        sec_start = soup.findAll('ul', attrs={"class": re.compile('issueTocShowhide')})
        for sec in sec_start:
            articles = []
            section = self.tag_to_string(
                sec.find('li', attrs={"class": "issueTocShowhide"}).span)
            for div_block in sec.findAll('div', attrs={"class": "articleInToc"}):
                h5 = div_block.find('h5')
                if h5 is None:
                    continue
                title = self.tag_to_string(h5)
                article_url = self.INDEX + h5.span.a['href']
                atr = div_block.findNext('span', attrs={'class': "meta-value authors"})
                author = self.tag_to_string(atr) if atr is not None else ''
                # NOTE(review): 'derscription' looks like a typo for
                # 'description'; kept as-is in case it matches the site's
                # actual markup -- verify against the page HTML.
                desc = div_block.findNext('div', attrs={'class': 'derscription contain'})
                description = self.tag_to_string(desc.p) if desc is not None else ''
                # BUG FIX: the original stored 'url': url, but `url` was never
                # assigned (the get_print_url call was commented out), so
                # parse_index raised NameError. Use article_url instead.
                articles.append({'title': title, 'date': None, 'url': article_url,
                                 'description': description, 'author': author})
            if articles:
                answer.append((section, articles))
        return answer