MobileRead Forums - View Single Post

gambarini · 05-13-2011, 04:37 AM

Hi Starson17,

This is the entire recipe.
My previous post was incorrect.
I obtain the entire page, like you, and it is identical to the page obtained with "VIEW SOURCE".
My problem is:
I am not able to find anything in the page.
I have tried various combinations of attributes, but with no results.

PHP Code:


			
#!/usr/bin/env  python
# Module-level metadata for this calibre news recipe.
__license__   = 'GPL v3'
__author__    = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
# NOTE(review): this module-level `description` is distinct from the
# `description` attribute set on the recipe class below - confirm which one
# is meant to be shown to the user.
description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version'
# Bare string expression (not the module docstring, because it is not the
# first statement) recording the index page the recipe scrapes.
'''
http://rassegnastampa.mef.gov.it/mefnazionale/Default.aspx
'''

from calibre.web.feeds.news import BasicNewsRecipe

class RassegnaMefParseIndex(BasicNewsRecipe):
    """Recipe that scrapes the MEF national press-review index into feeds.

    Each index page is fetched with ``index_to_soup``; the table rows found
    inside ``<div id="results">`` are converted into article entries.
    """

    author        = 'Marini Gabriele'
    description   = 'Rassegna Stampa MEV'

    cover_url      = 'http://rassegnastampa.mef.gov.it/Mef/sorg_n/nazionale.jpg'
    title          = u'Rassegna MEF'
    publisher      = 'Ministero Economia e Finanze'
    category       = 'News, politics, culture, economy, general interest'

    language       = 'it'
    timefmt        = '[%a, %d %b, %Y]'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content  = False
    # NOTE(review): calibre documents this option as `recursions`; the name
    # used here is presumably ignored. Left unchanged because renaming it
    # would change how deep the download follows links - confirm intent.
    recursion             = 10

    remove_javascript = True

    # CSS classes of the cells whose text is concatenated, in this order,
    # to form the article title.
    _TITLE_CELL_CLASSES = (
        'TopicCellShort',
        'PublicationCellShort',
        'TitleCellShort',
    )

    def parse_index(self):
        """Build the feed list from the press-review index pages.

        Returns:
            A list of ``(feed_title, articles)`` tuples, where ``articles``
            is a list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date``. Pages that yield no articles are
            omitted.
        """
        feeds = []

        index_pages = [
            ("Rassegna Nazionale", "http://rassegnastampa.mef.gov.it/mefnazionale/Default.aspx"),
            ("Rassegna Nazionale 2", "http://rassegnastampa.mef.gov.it/mefnazionale/"),
        ]

        for title, url in index_pages:
            soup = self.index_to_soup(url)

            # The keyword is `attrs`; the previous `attr=` was interpreted as
            # a filter on an HTML attribute literally named "attr", so the
            # results container was never found.
            results = soup.find(name='div', attrs={'id': 'results'})
            if results is None:
                continue

            articles = []
            # Only rows inside the results container are considered
            # (findAllNext would also walk the rest of the document).
            for row in results.findAll('tr'):
                entry = self._parse_row(row)
                if entry is not None:
                    articles.append(entry)

            if articles:
                feeds.append((title, articles))
        return feeds

    def _parse_row(self, row):
        """Turn one table row into an article dict, or None if unusable.

        A row is unusable when it has no title text or no link (for example
        a header row).
        """
        parts = []
        for css_class in self._TITLE_CELL_CLASSES:
            cell = row.find(attrs={'class': css_class})
            if cell is not None:
                parts.append(self.tag_to_string(cell))
        title = ''.join(parts)
        if not title:
            return None

        link_holder = row.find(attrs={'class': 'OcrLinkCellShort'})
        if link_holder is None:
            return None
        link = link_holder.get('href', False)
        if not link:
            # presumably the class sits on the table cell and the anchor is
            # nested inside it - verify against the live page markup.
            anchor = link_holder.find('a', href=True)
            if anchor is not None:
                link = anchor.get('href', False)
        if not link:
            return None

        return {'title': title, 'url': link, 'description': '', 'date': ''}

05-13-2011, 04:37 AM	#3
gambarini Connoisseur Posts: 98 Karma: 22 Join Date: Mar 2010 Device: IRiver Story, Ipod Touch, Android SmartPhone	Hy Starson17 this is the entire recipe. my previous post is incorrect. I obtain the entire page, like you, and it's equal than the page obtained with "VIEW SOURCE". My problem is: i am not able to find anything in the page. i have tried various combinatio of attribute but with no results. PHP Code: #!/usr/bin/env python __license__ = 'GPL v3' __author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>' description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version' ''' http://rassegnastampa.mef.gov.it/mefnazionale/Default.aspx ''' from calibre.web.feeds.news import BasicNewsRecipe class RassegnaMefParseIndex(BasicNewsRecipe): author = 'Marini Gabriele' description = 'Rassegna Stampa MEV' cover_url = 'http://rassegnastampa.mef.gov.it/Mef/sorg_n/nazionale.jpg' title = u'Rassegna MEF' publisher = 'Ministero Economia e Finanze' category = 'News, politics, culture, economy, general interest' language = 'it' timefmt = '[%a, %d %b, %Y]' oldest_article = 7 max_articles_per_feed = 100 use_embedded_content = False recursion = 10 remove_javascript = True def parse_index(self): feeds = [] for title, url in [ ("Rassegna Nazionale", "http://rassegnastampa.mef.gov.it/mefnazionale/Default.aspx"), ("Rassegna Nazionale 2", "http://rassegnastampa.mef.gov.it/mefnazionale/") ]: soup = self.index_to_soup(url) articles = [] #Main Aperture soup = soup.find(name='div', attr={'id':'results'}) if soup: article = soup.find('tbody') for article in soup.findAllNext('tr'): article_first = article tupla = article.find(attrs={'class':'TopicCellShort'}) title_url = self.tag_to_string(tupla) tupla = article.find(attrs={'class':'PublicationCellShort'}) title_url += self.tag_to_string(tupla) tupla = 
article.find(attrs={'class':'TitleCellShort'}) title_url += self.tag_to_string(tupla) tupla = article.find(attrs={'class':'OcrLinkCellShort'}) link = tupla.get('href', False) date = '' description = '' if title_url: articles.append({'title': title_url, 'url': link,'description':description, 'date':date}) if articles: feeds.append((title, articles)) return feeds Last edited by gambarini; 05-13-2011 at 06:09 AM.