Hey Starson17, I'm trying to apply what you showed me on Field & Stream, but I'm still a little confused.
Trying to play around with that
http://www.laineygossip.com/ for the other user. I can get the other articles just fine using the methods that you showed me but I'm having trouble getting the ones that are not inside the <h2> tags. More specifically look at
http://www.laineygossip.com/
Notice how it has the date then it goes dear gossipers, blah blah blah
Well, my thought was to do this to get those articles, append them to the array, and then do another for loop to get the other articles that follow different criteria.
here is what i'm having an issue with
Spoiler:
Code:
def make_links(self, url):
title = 'Temp'
current_articles = []
soup = self.index_to_soup(url)
print 'The soup is: ', soup
for t_item in soup.findAll('div', {"class":"leftcontent"}):
print 't_item is: ', t_item
title = t_item.h1.string
for content in t_item.findAll('div', {"class":"artIntroShort"}):
print 'The content is: ', content
art_text = t_item.p.string
print 'Art_text is :', art_text
link = t_item.find('a')
print 'The link is :', link
url = self.INDEX + link['href']
print 'The URL is :', url
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
The articles are contained in the div class=leftcontent, and the title is inside an h1 tag there. Then, since I was already inside leftcontent thanks to the for loop, I figured I would do another findAll for artIntroShort and parse that for the URL and the article text in the <p> tag.
here is the whole code i have thus far
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class GOSSIPBLOG(BasicNewsRecipe):
title = 'Gossip'
__author__ = 'Tonythebookworm'
description = 'Gossip'
language = 'en'
no_stylesheets = True
publisher = 'Tonythebookworm'
category = 'gossip'
use_embedded_content= False
no_stylesheets = True
oldest_article = 24
remove_javascript = True
remove_empty_feeds = True
# masthead_url = ''
# cover_url = ''
# recursions = 0
max_articles_per_feed = 10
INDEX = 'http://www.laineygossip.com/'
#keep_only_tags = [dict(name='div', attrs={'class':['mainContent']})
# ]
#remove_tags = [dict(name='div', attrs={'id':['comments']})]
def parse_index(self):
feeds = []
for title, url in [
(u"Gossip", u"http://www.laineygossip.com/"),
]:
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
return feeds
def make_links(self, url):
title = 'Temp'
current_articles = []
soup = self.index_to_soup(url)
print 'The soup is: ', soup
for t_item in soup.findAll('div', {"class":"leftcontent"}):
print 't_item is: ', t_item
title = t_item.h1.string
for content in t_item.findAll('div', {"class":"artIntroShort"}):
print 'The content is: ', content
art_text = t_item.p.string
print 'Art_text is :', art_text
link = t_item.find('a')
print 'The link is :', link
url = self.INDEX + link['href']
print 'The URL is :', url
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
#---------------- next section ---------------------------------
for item in soup.findAll('h2'):
print 'item2 is: ', item
link2 = item.find('a')
print 'the link2 is: ', link2
if link2:
url = self.INDEX + link2['href']
print 'the title2 is: ', title
print 'the url2 is: ', url
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
return current_articles
I know I'm close to getting this, yet it seems so far away.