Quote:
Originally Posted by Starson17
I'm not sure what you're asking. The images appear in the html produced with your code and my changes - they don't appear in your code without them. The img tag appears in my print of the newdiv tag with my changes, but not with your code. Do you want me to post your code with my changes, as tested?
|
if you don't mind cause i would like to see what I'm doing wrong. thanks. as for the issue at hand. the dang image wrapped around the text Like the original poster mentioned in his screenshot. I figured to solve the problem i would simply remove the tables and then enclose the image tag inside a div tag or p tag. didn't work that well
here is the code i am using:
Spoiler:
Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
from BeautifulSoup import BeautifulSoup, Tag
class RevistaMuyInteresante(BasicNewsRecipe):
title = 'Revista Muy Interesante'
__author__ = 'Jefferson Frantz'
description = 'Revista de divulgacion'
timefmt = ' [%d %b, %Y]'
language = 'es_ES'
conversion_options = {'linearize_tables' : True}
keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]
remove_tags = [
dict(name=['object','link','script','ul'])
,dict(name='div', attrs={'id':['comment']})
,dict(name='td', attrs={'class':['buttonheading']})
,dict(name='div', attrs={'class':['tags_articles']})
]
remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
def nz_parse_section(self, url):
soup = self.index_to_soup(url)
div = soup.find(attrs={'class':'contenido'})
current_articles = []
for x in div.findAllNext(attrs={'class':['headline']}):
a = x.find('a', href=True)
if a is None:
continue
title = self.tag_to_string(a)
url = a.get('href', False)
if not url or not title:
continue
if url.startswith('/'):
url = 'http://www.muyinteresante.es'+url
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
current_articles.append({'title': title, 'url':url,
'description':'', 'date':''})
return current_articles
def parse_index(self):
feeds = []
for title, url in [
('Historia',
'http://www.muyinteresante.es/historia-articulos'),
]:
articles = self.nz_parse_section(url)
if articles:
feeds.append((title, articles))
return feeds
def preprocess_html(self, soup):
for img_tag in soup.findAll('img'):
parent_tag = img_tag.parent
data = img_tag
new_img_tag = Tag(soup,'img')
new_img_tag['src'] = img_tag['src']
data = new_img_tag
newdiv = Tag(soup,'div')
newtag = Tag(soup,'p')
newtag.insert(0,data)
newdiv.insert(0,newtag)
parent_tag.insert(0,newdiv)
print 'parent tag is: ', parent_tag
print 'newdiv is: ', newdiv
print 'data is: ',data
print 'newtag is: ', newtag
print 'the soup is: ', soup
return soup
def postprocess_html(self, soup, first):
for tag in soup.findAll(name=['table', 'tr', 'td']):
tag.name = 'div'
return soup