View Single Post
Old 06-01-2011, 11:56 AM   #1
schuster
Zealot
schuster doesn't litterschuster doesn't litter
 
Posts: 119
Karma: 100
Join Date: Jan 2011
Location: Germany / NRW /Köln
Device: prs-650 / prs-350 /kindle 3
need help / duplicate entries in news

Hi,
I have made a recipe for the latest news from the local police.
It works well, but every entry appears twice in the MOBI file.
I don't know why. Could anyone help me?

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
class AdvancedUserRecipe(BasicNewsRecipe):
    """Recipe for the police press releases listed on the ``INDEX`` page.

    ``parse_index`` scrapes the listing page and returns one feed per
    ``div.storylist_item``; ``print_version`` rewrites each article URL to
    the site's ``print.htx`` view.
    """

    title      = u'polizei_test'
    __author__ = u'schuster'
    masthead_url = 'http://www.presseportal.de/showbin.htx?id=65851&type=logo'
    cover_url = 'http://www.polizei-nrw.de/rhein-kreis-neuss/stepone/data/images/95/02/00/200_rheinkreis_neuer_stern.jpg'
    # NOTE(review): calibre recipes usually use a lowercase code ('de');
    # left unchanged here -- confirm before altering.
    language = 'DE'
    INDEX                  = 'http://www.presseportal.de/polizeipresse/p_story.htx?search=grevenbroich&firmaid=65851'

    # Host that the site-relative article links are resolved against.
    BASE_URL = 'http://www.presseportal.de'
    # Only links whose href starts with this prefix are treated as articles.
    ARTICLE_PREFIX = '/polizeipresse/pm/65851/2'

    remove_tags = [
        dict(name='div', attrs={'id': 'logo'}),
        dict(name='div', attrs={'id': 'origin'}),
        dict(name='pre', attrs={'class': 'xml_contact'}),
    ]

    no_stylesheets = True
    remove_javascript     = True

    @staticmethod
    def _class_tokens(tag):
        """Return the ``class`` attribute of ``tag`` as a list of tokens.

        Works whether the parser exposes ``class`` as a single string or as
        a list of names; a missing or empty attribute yields ``[]``.
        """
        klass = tag.get('class') or ''
        if isinstance(klass, (list, tuple)):
            return list(klass)
        return klass.split()

    def parse_index(self):
        """Build the feed list from the listing page at ``INDEX``.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts.
        Sections that yield no articles are omitted.
        """
        soup = self.index_to_soup(self.INDEX)
        feeds = []
        # Every URL already emitted, so the same story is never added twice.
        seen_urls = set()
        for section in soup.findAll('div', attrs={'class': 'storylist_item'}):
            section_title = self.tag_to_string(
                section.find(name='h3', attrs={'class': 'title'}))
            articles = []
            for post in section.findAll('a', href=True):
                href = post['href']
                if not href.startswith(self.ARTICLE_PREFIX):
                    continue
                classes = self._class_tokens(post)
                # Links without a class are ignored (as before).  Links with
                # class "more" point at the same story as the headline link
                # and were the cause of the duplicated entries.
                if not classes or 'more' in classes:
                    continue
                url = self.BASE_URL + href
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                title = self.tag_to_string(post)
                self.log()
                self.log('--> post:  ', post)
                self.log('--> url:   ', url)
                self.log('--> title: ', title)
                self.log('--> class: ', ' '.join(classes))
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def print_version(self, url):
        """Return the print-view URL for the article at ``url``.

        The result is the scheme and host of ``url`` followed by
        ``/print.htx?nr=<n>&type=polizei``, where ``<n>`` is the seventh
        ``/``-separated piece of ``url`` (empty if ``url`` has fewer pieces).
        """
        segments = url.split('/')
        site_root = '/'.join(segments[:3])
        # Slicing (not indexing) keeps short URLs from raising IndexError.
        article_nr = '/'.join(segments[6:7])
        return site_root + '/print.htx?nr=' + article_nr + '&type=polizei'

Solved: I forgot to set `if klass != "more":`.

Last edited by schuster; 06-01-2011 at 03:14 PM. Reason: clear now / after some coffee
schuster is offline   Reply With Quote