css
Code:
# Stylesheet text for the recipe, one CSS rule per entry. The empty first and
# last entries reproduce the leading and trailing newline of the stylesheet.
extra_css = '\n'.join((
    '',
    'article-sidebar{font-family:Georgia,"Times New Roman",Times,serif; border:ridge; text-align:left;}',
    '[close-caption]{ border:ridge; font-size:small; text-align:center;}',
    'article-ideainbrief{font-family:Georgia,"Times New Roman",Times,serif; text-align:left; font-style:italic; }',
    '.article-byline-list{font-size:small;}',
    '.credits--hero-image{font-size:small;}',
    '.credits--inline-image{font-size:small;}',
    '.caption--inline-image{font-size:small;}',
    '.description-text{font-size:small; color:gray;}',
    '.right-rail--container{font-size:small; color:#4c4c4c;}',
    '.link--black{font-size:small;}',
    '.article-callout{color:#4c4c4c; text-align:center;}',
    '.slug-content{color:gray;}',
    '',
))
keep tags: remove 'pub-date' from the classes and add the bold parts from below
Code:
# Matchers for the page elements to keep: a set of CSS class names plus the
# custom <article-sidebar> element.
keep_only_tags = [
    classes(
        # Adjacent string literals are concatenated by Python, so the first
        # literal must end with a space; otherwise 'standard-content' and
        # 'article-dek-group' fuse into one bogus class name.
        # NOTE(review): assumes classes() takes a whitespace-separated list
        # of class names — confirm against the helper's definition.
        'headline-container hero-image-content article-summary article-body standard-content '
        'article-dek-group article-dek slug-container'
    ),
    dict(name='article-sidebar'),
]
and..
Code:
def preprocess_html(self, soup):
    """Tidy the parsed article markup and return the same soup object.

    Steps, in order:
      * drop the ``href`` attribute from every ``slug-content`` element;
      * inside each ``article-byline`` element, remove the
        ``<span class="by-prefix">`` nodes and rename every ``<li>`` to
        ``<span>``;
      * rename every ``<h2>`` and ``<h3>`` to ``<h5>``.

    :param soup: parsed document tree; it is modified in place.
    :return: the same ``soup`` instance.
    """
    # NOTE(review): nesting was reconstructed from which variable each loop
    # uses (``dek`` vs ``soup``) — confirm against the intended layout.
    for slug in soup.findAll(**classes('slug-content')):
        del slug['href']

    for dek in soup.findAll(**classes('article-byline')):
        for by in dek.findAll('span', attrs={'class':'by-prefix'}):
            by.extract()
        for li in dek.findAll('li'):
            li.name = 'span'

    for heading in soup.findAll(('h2','h3')):
        heading.name = 'h5'

    return soup