https://github.com/kovidgoyal/calibr...ivemint.recipe
Code:
def preprocess_raw_html(self, raw, *a):
    """Rebuild the ``FirstEle`` article div from the page's ld+json data.

    Pages that contain the ``wsjFlag`` script marker are expected to embed a
    ``NewsArticle`` JSON-LD block. Its ``articleBody`` and
    ``hasPart['value']`` strings are joined into paragraphs and substituted
    for the existing ``<div class="FirstEle">`` element.

    :param raw: Raw HTML of the downloaded page, as a string.
    :param a: Extra positional arguments; accepted and ignored.
    :return: The rewritten HTML, or ``raw`` unchanged when the marker, the
        JSON-LD block, or the expected JSON keys are missing.
    """
    if '<script>var wsjFlag=true;</script>' not in raw:
        return raw

    m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
    if m is None:
        # Flagged page without the NewsArticle block: nothing to rebuild.
        return raw

    # Drop everything up to and including the '>' that closes the <script>
    # opening tag, so the remaining text starts at the JSON payload.
    raw1 = raw[m.start():].split('>', 1)[1].strip()
    try:
        # raw_decode parses the leading JSON value and ignores the HTML
        # that follows it.
        data = json.JSONDecoder().raw_decode(raw1)[0]
        lead = data['articleBody']
        value = data['hasPart']['value']
    except (ValueError, KeyError, TypeError):
        # Malformed JSON or an unexpected structure: keep the original page
        # instead of aborting the download of this article.
        return raw

    # Sentences in `value` run together ("end.Next"); start a new paragraph
    # wherever a lowercase letter or digit plus '.' directly precedes a capital.
    body = lead + '</p> <p>' + re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1 <p> \2', value)
    body = '<div class="FirstEle"> <p>' + body + '</p> </div>'

    # Use a callable replacement so backslashes in the article text are
    # inserted literally rather than parsed as regex escapes/group references.
    return re.sub(r'<div class="FirstEle">([^}]*)</div>', lambda _match: body, raw)
Add this to the else: part (non-Saturday) — some WSJ articles won't load.
Also add this to extra_css = in the same part:
.summary{font-style:italic; color:#404040;}
Also set resolve_internal_links = True.