Dear Sir,
I am trying to fetch Gujarati news from Sandesh:
www.sandesh.com
I have written the following code.
Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
class FE_India(BasicNewsRecipe):
    """Calibre news recipe that downloads articles from Sandesh (www.sandesh.com).

    Articles are discovered through the feed listed in ``feeds``; each article
    URL is rewritten to the site's print page by ``print_version`` and the
    downloaded HTML is cleaned up by ``postprocess_html``.
    """

    # --- Metadata -----------------------------------------------------------
    title = 'Sandesh'
    __author__ = 'Parag Soni'
    description = 'Sandesh Gujarati'
    publisher = 'Sandesh'
    category = 'news, politics, finances, India'
    language = 'gu_IN'
    publication_type = 'magazine'
    masthead_url = 'http://www.sandesh.com/IMAGES/Sandesh_Logo.gif'

    # --- Fetch settings -----------------------------------------------------
    oldest_article = 30
    max_articles_per_feed = 200
    remove_empty_feeds = True
    use_embedded_content = False
    # NOTE(review): cp1252 has no code points for Gujarati script. If the site
    # serves its pages as UTF-8, this should be 'utf-8' -- confirm against the
    # charset the pages actually declare.
    encoding = 'cp1252'

    # --- Styling / clean-up -------------------------------------------------
    no_stylesheets = True
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } '
    # NOTE(review): only elements with class="txt" are kept. If articles come
    # out with headings but no body text, verify that the article body on the
    # fetched (print) page really carries this class.
    keep_only_tags = [dict(attrs={'class': 'txt'})]
    remove_attributes = ['width', 'height']

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    feeds = [(u'National', u'http://www.sandesh.com/cms/xml/National.xml')]

    def print_version(self, url):
        """Return the print-page URL for the article at ``url``.

        The numeric id is taken from the ``newsid=<digits>`` part of ``url``.
        If ``url`` contains no such part it is returned unchanged.
        """
        # Imported locally: the method needs ``re`` and the module's import
        # lines do not provide it, which made this call raise NameError.
        import re

        match = re.search(r'newsid=(\d+)', url)
        if not match:
            return url
        return 'http://www.sandesh.com/printarticle.aspx?newsid=' + match.group(1)

    def postprocess_html(self, soup, first_fetch):
        """Flatten table markup and drop the block linking to the home page.

        Every ``table``/``tr``/``td`` tag is renamed to ``div``. Then the
        parent of the first element whose ``href`` is the site home page is
        removed, if such an element exists. Returns the modified ``soup``.
        """
        for tag in soup.findAll(['table', 'tr', 'td']):
            tag.name = 'div'

        home_link = soup.find(href='http://www.sandesh.com/')
        if home_link is not None:
            # Remove the whole enclosing element, not just the link itself.
            home_link.parent.extract()
        return soup
However, I am getting only the headings, not the full news articles.
Please help me.
Thanks in advance,
Parag