View Single Post
Old 05-23-2011, 02:53 PM   #5
schuster
Zealot
schuster doesn't litterschuster doesn't litter
 
Posts: 119
Karma: 100
Join Date: Jan 2011
Location: Germany / NRW /Köln
Device: prs-650 / prs-350 /kindle 3
Hi starson, I hope it's okay if I also ask a question here.

You are right, but I don't understand it.
I have been experimenting without success.

Code:
class AdvancedUserRecipe1305567197(BasicNewsRecipe):
    """Calibre recipe for two Focus RSS feeds with multi-page article support.

    Articles that are split over several pages expose a "next page" link
    (``<a class="nextPage greyButton">``).  ``preprocess_html`` follows that
    link chain via ``append_page`` and splices the text block of every
    following page into the first page.
    """

    title = u'Focus - test'
    __author__ = 'for_test'
    oldest_article = 20
    max_articles_per_feed = 10
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True

    # Site root used to resolve relative "next page" links.  The original
    # recipe referenced self.INDEX without ever defining it.
    # NOTE(review): assumed to be the host that serves the articles - confirm.
    INDEX = 'http://www.focus.de'

    feeds = [
        (u'Eilmeldungen', u'http://rss2.focus.de/c/32191/f/533875/index.rss'),
        (u'Wissen-News', u'http://rss2.focus.de/c/32191/f/533876/index.rss'),
    ]

    def get_article_url(self, article):
        """Return the article URL taken from the feed entry's id (or guid)."""
        return article.get('id', article.get('guid', None))

    def append_page(self, soup, appendtag, position):
        """Fetch the page following ``soup`` and insert its text block.

        soup      -- parsed page that may contain a "next page" link
        appendtag -- tag that receives the text block of the next page
        position  -- index in ``appendtag.contents`` at which to insert

        Recurses so that every further page ends up nested at the end of the
        text block of the page before it.  Does nothing when there is no
        pager, the pager has no href, or the next page has no text block.
        """
        # The pager IS the anchor element, so the href lives on it directly.
        pager = soup.find('a', attrs={'class': 'nextPage greyButton'})
        if not pager:
            return

        href = pager.get('href')
        if not href and pager.a is not None:
            # Fallback for markup where the link is nested inside the pager.
            href = pager.a.get('href')
        if not href:
            return

        # Only prefix the site root when the link is not already absolute.
        if href.startswith('http://') or href.startswith('https://'):
            nexturl = href
        else:
            nexturl = self.INDEX.rstrip('/') + '/' + href.lstrip('/')

        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'textBlock'})
        if texttag is None:
            # Next page has no article text; nothing to append.
            return

        for it in texttag.findAll(style=True):
            del it['style']

        # Pull in the pages after this one before moving the block over.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean the first page and append the text of all following pages."""
        for item in soup.findAll(style=True):
            del item['style']
        # Overhead line that sits before the text block.
        for item in soup.findAll('span', attrs={'class': 'overhead'}):
            item.extract()
        self.append_page(soup, soup.body, 3)
        # Page counter left over from the pagination widget.
        pager = soup.find('div', attrs={'class': 'pageCounter'})
        if pager:
            pager.extract()
        return self.adeify_images(soup)

# feed with multipage in "wissen-news":
# Ozonloch-Studie - Zwischen Euphorie und Hysterie
Is this right? I have had no luck getting it to work.
It grabs only the normal (single-page) articles; the pages after the first of a multi-page article are lost.

greetings
schuster is offline   Reply With Quote