from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re

class TechCrunchRecipe(BasicNewsRecipe):
    """Calibre news recipe for the TechCrunch FeedBurner feed.

    The recipe enables ``use_embedded_content``, strips the ``feedflare``
    div via ``remove_tags`` and, in ``preprocess_html``, drops the parent
    element of the first feed-ad link and of the first link pointing at a
    Crunch-network landing page.
    """

    # --- Recipe identity ---
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    version = 1
    language = 'en_US'

    # --- Descriptive metadata (also forwarded to the conversion step) ---
    title = u'Tech Crunch'
    publisher = u'techcrunch.com'
    category = u'Technology news'
    description = (u'A group-edited blog that profiles the companies, '
                   u'products and events defining and transforming the '
                   u'new web.')

    # --- Feed selection ---
    feeds = [(u'Tech Crunch', u'http://feeds.feedburner.com/TechCrunch')]
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = True

    # --- Clean-up applied to every article ---
    remove_tags = [dict(name='div', attrs={'class': 'feedflare'})]

    # NOTE(review): 'language' is hard-coded to 'en' here while the recipe
    # itself declares 'en_US' -- confirm the difference is intentional.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': 'en',
        'publisher': publisher,
    }

    # NOTE(review): 'text-align: right' is unusual for English body text --
    # confirm it is intended.
    extra_css = '''
                body {font-family:verdana,arial,helvetica,geneva,sans-serif; text-align: right;}
                a {text-decoration: none; color: blue;}
                '''

    def preprocess_html(self, soup):
        """Strip ad and Crunch-network promo links from an article.

        For each href filter, the first matching ``<a>`` is looked up and its
        parent element is extracted from the tree. Only the first match per
        filter is handled.

        :param soup: parsed article markup; it is modified in place.
        :return: the same ``soup`` object.
        """
        feed_ad_href = re.compile('http://feedads.*')
        crunch_network_hrefs = [
            'http://www.crunchgear.com', 'http://www.crunchgear.com/',
            'http://www.crunchbase.com', 'http://www.crunchbase.com/',
            'http://www.crunchboard.com', 'http://www.crunchboard.com/',
            'http://www.mobilecrunch.com', 'http://www.mobilecrunch.com/',
            'http://www.crunchboard.com/jobs',
            'http://www.crunchboard.com/jobs/',
        ]

        # Order matters: the ad container is removed before the promo lookup,
        # so the second search runs against the already-pruned tree.
        for href_filter in (feed_ad_href, crunch_network_hrefs):
            link = soup.find('a', attrs={'href': href_filter})
            if link:
                link.parent.extract()

        return soup

