Why is this failing?
This is simple test code for Bloomberg Businessweek, written to check why the main recipe is failing.
Code:
from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre import browser
class bb(BasicNewsRecipe):
    """Minimal test recipe for one Bloomberg Businessweek issue.

    Collects the feature-article links from a single hard-coded issue page
    and downloads them without sending any cookies.
    """

    title = 'Bloomberg Businessweek test'
    language = 'en'
    __author__ = "unkn0wn"
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['style', 'width', 'height']
    # The ``x and ...`` guards keep the filters from raising AttributeError
    # if they are handed None for a tag that has no ``class`` attribute.
    keep_only_tags = [
        dict(name='div', attrs={'class': lambda x: x and x.startswith('lede-text__')}),
        dict(name='figure', attrs={'class': lambda x: x and x.startswith('lede-media__')}),
        classes('body-content fence-body'),
    ]

    def parse_index(self):
        """Build the article index for the issue.

        Returns:
            A list holding one ``('Articles', articles)`` tuple, where
            ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts,
            one per feature link found on the issue page.
        """
        soup = self.index_to_soup('https://www.bloomberg.com/magazine/businessweek/22_21')
        ans = []
        # Guard against a missing href before testing its content.
        for a in soup.findAll('a', href=lambda x: x and '/news/features/' in x):
            url = a['href']
            if url.startswith('/'):
                # Site-relative link: make it absolute. Do NOT return here;
                # returning inside the loop aborted the index after the
                # first link and handed back a bare string.
                url = 'https://www.bloomberg.com' + url
            title = self.tag_to_string(a)
            self.log(title, ' at ', url)
            ans.append({'title': title, 'url': url})
        return [('Articles', ans)]

    # BB changes the content it delivers based on cookies, so the
    # following ensures that we send no cookies
    def get_browser(self, *args, **kwargs):
        """Return the recipe itself, which stands in for the browser object."""
        return self

    def clone_browser(self, *args, **kwargs):
        """Return the same stand-in object that ``get_browser`` returns."""
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        """Open a URL with a newly created browser on every call."""
        br = browser()
        return br.open_novisit(*args, **kwargs)

    # ``open`` uses the same fresh-browser implementation.
    open = open_novisit
for a in soup.findAll('a', href = lambda x: x.__contains__('/news/features/')):
This should return at least 3 or 4 links, and I checked with JavaScript disabled too.
I tried different ways to fetch the links. I don't think calibre is looking at the same page (soup) that I am able to see in my browser. How can we know?