View Single Post
Old 08-31-2010, 06:58 PM   #2582
TonytheBookworm
Addict
TonytheBookworm is on a distinguished road
 
TonytheBookworm's Avatar
 
Posts: 264
Karma: 62
Join Date: May 2010
Device: kindle 2, kindle 3, Kindle fire
Hey Starson17 trying to apply what you had showed me on field and streams yet still a little confused.
Trying to play around with that http://www.laineygossip.com/ for the other user. I can get the other articles just fine using the methods that you showed me but I'm having trouble getting the ones that are not inside the <h2> tags. More specifically look at http://www.laineygossip.com/
Notice how it has the date then it goes dear gossipers, blah blah blah

well my thoughts were to take and do this to get the those articles then append it to the array then do another for loop to get the other articles that follow a different criteria

here is what i'm having an issue with
Spoiler:

Code:
def make_links(self, url):
        title = 'Temp'
        current_articles = []
        soup = self.index_to_soup(url)
        print 'The soup is: ', soup
        for t_item in soup.findAll('div', {"class":"leftcontent"}):
		  print 't_item is: ', t_item
		  title = t_item.h1.string
		  for content in t_item.findAll('div',  {"class":"artIntroShort"}):
		     print 'The content is: ', content 
		     art_text = t_item.p.string
		     print 'Art_text is :', art_text
		     link = t_item.find('a')
		     print 'The link is :', link
		     url = self.INDEX + link['href']
		     print 'The URL is :', url
		  
			
		  current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this

the articles are contained in the div class=leftcontent and the title is inside a h1 tag there. then i figured since i was inside the leftcontent due to the for look then i would then take and do another findall for the artIntroShort then parse it for the url and the article text that is in the <p> tag.....


here is the whole code i have thus far

Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class GOSSIPBLOG(BasicNewsRecipe):
    title      = 'Gossip'
    __author__ = 'Tonythebookworm'
    description = 'Gossip'
    language = 'en'
    no_stylesheets = True
    publisher           = 'Tonythebookworm'
    category            = 'gossip'
    use_embedded_content= False
    no_stylesheets      = True
    oldest_article      = 24
    remove_javascript   = True
    remove_empty_feeds  = True
    # masthead_url        = ''
    # cover_url           = ''
    # recursions          = 0
    max_articles_per_feed = 10
    INDEX = 'http://www.laineygossip.com/'
    #keep_only_tags     = [dict(name='div', attrs={'class':['mainContent']})
    #                      ]
    #remove_tags = [dict(name='div', attrs={'id':['comments']})]
    
    def parse_index(self):
        feeds = []
        for title, url in [
                            (u"Gossip", u"http://www.laineygossip.com/"),
                            
                             ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds
        
    def make_links(self, url):
        title = 'Temp'
        current_articles = []
        soup = self.index_to_soup(url)
        print 'The soup is: ', soup
        for t_item in soup.findAll('div', {"class":"leftcontent"}):
		  print 't_item is: ', t_item
		  title = t_item.h1.string
		  for content in t_item.findAll('div',  {"class":"artIntroShort"}):
		     print 'The content is: ', content 
		     art_text = t_item.p.string
		     print 'Art_text is :', art_text
		     link = t_item.find('a')
		     print 'The link is :', link
		     url = self.INDEX + link['href']
		     print 'The URL is :', url
		  
			
		  current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
        
        #---------------- next section ---------------------------------
        for item in soup.findAll('h2'):
          print 'item2 is: ', item
          link2 = item.find('a')
          print 'the link2 is: ', link2
          if link2:
            url         = self.INDEX + link2['href']
            print 'the title2 is: ', title
            print 'the url2 is: ', url
            current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
          
        return current_articles


I know i'm close to getting this yet seem so far away.
TonytheBookworm is offline