09-29-2010, 01:49 PM   #7
TonytheBookworm
Quote:
Originally Posted by Starson17
I'm not sure what you're asking. The images appear in the html produced with your code and my changes - they don't appear in your code without them. The img tag appears in my print of the newdiv tag with my changes, but not with your code. Do you want me to post your code with my changes, as tested?
If you don't mind, yes, because I would like to see what I'm doing wrong. Thanks. As for the issue at hand: the image wrapped around the text, like the original poster showed in his screenshot. I figured that to solve the problem I would simply remove the tables and then enclose the image tag inside a div or p tag, but that didn't work very well.
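
In isolation, what I was going for looks something like this (just a minimal sketch of the wrap-and-replace idea, assuming the BeautifulSoup 3 API that calibre bundles, not the code I actually ran):

Code:
from BeautifulSoup import BeautifulSoup, Tag

html = '<td><img src="pic.jpg" />caption text</td>'
soup = BeautifulSoup(html)
img = soup.find('img')

# Build the <div><p></p></div> wrapper.
wrapper = Tag(soup, 'div')
para = Tag(soup, 'p')
wrapper.insert(0, para)

# Swap the original <img> for the wrapper, then move the image
# inside it, so the old tag is not left behind and duplicated.
img.replaceWith(wrapper)
para.insert(0, img)

print soup
# -> <td><div><p><img src="pic.jpg" /></p></div>caption text</td>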

Here is the code I am using:
Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
from BeautifulSoup import BeautifulSoup, Tag

class RevistaMuyInteresante(BasicNewsRecipe):

    title       = 'Revista Muy Interesante'
    __author__  = 'Jefferson Frantz'
    description = 'Revista de divulgacion'
    timefmt = ' [%d %b, %Y]'
    language = 'es_ES'
    conversion_options = {'linearize_tables' : True}
    keep_only_tags = [dict(name='div', attrs={'class':['article']})
                      ,dict(name='td', attrs={'class':['txt_articulo']})]
    remove_tags        = [
                             dict(name=['object','link','script','ul'])
                            ,dict(name='div', attrs={'id':['comment']})
                            ,dict(name='td', attrs={'class':['buttonheading']})
                            ,dict(name='div', attrs={'class':['tags_articles']})
                         ]

    remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})

    def nz_parse_section(self, url):
        # Collect a title/url pair for every headline on a section page.
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'class':'contenido'})

        current_articles = []
        for x in div.findAllNext(attrs={'class':['headline']}):
            a = x.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            url = a.get('href', False)
            if not url or not title:
                continue
            if url.startswith('/'):
                url = 'http://www.muyinteresante.es' + url
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', url)
            current_articles.append({'title': title, 'url': url,
                                     'description': '', 'date': ''})

        return current_articles


    def parse_index(self):
        # One section for now; more (title, url) pairs can be added to
        # the list below.
        feeds = []
        for title, url in [
            ('Historia',
             'http://www.muyinteresante.es/historia-articulos'),
        ]:
            articles = self.nz_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds
    
    def preprocess_html(self, soup):
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent

            # Build a fresh <img> carrying only the src attribute.
            new_img_tag = Tag(soup, 'img')
            new_img_tag['src'] = img_tag['src']

            # Wrap it in <div><p>...</p></div> and push the wrapper to
            # the front of the parent. Note the original <img> is never
            # removed, so the image ends up in the document twice;
            # replaceWith (as in the sketch above) would avoid that.
            newdiv = Tag(soup, 'div')
            newtag = Tag(soup, 'p')
            newtag.insert(0, new_img_tag)
            newdiv.insert(0, newtag)
            parent_tag.insert(0, newdiv)

            # Debug output; visible when the recipe is run from the
            # command line rather than the GUI.
            print 'parent tag is: ', parent_tag
            print 'newdiv is: ', newdiv
            print 'new img is: ', new_img_tag
            print 'newtag is: ', newtag
            print 'the soup is: ', soup

        return soup
    
    def postprocess_html(self, soup, first):
        # Flatten any tables that survive conversion by renaming the
        # table tags to divs.
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
        return soup
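
For completeness, the tag renaming in postprocess_html can be sanity-checked on a toy fragment (same BeautifulSoup 3 assumptions as the sketch above):

Code:
from BeautifulSoup import BeautifulSoup

html = '<table><tr><td>some text</td></tr></table>'
soup = BeautifulSoup(html)

# Rename the table tags in place; children and text are kept.
for tag in soup.findAll(name=['table', 'tr', 'td']):
    tag.name = 'div'

print soup
# -> <div><div><div>some text</div></div></div>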
