View Single Post
Old 03-30-2014, 08:42 PM   #2
Camper65
Enthusiast
Camper65 began at the beginning.
 
Posts: 30
Karma: 10
Join Date: Apr 2011
Device: Kindle wifi; Toshiba Thrive
I'm working on downloading the Personal Tech (formerly Gadgetwise) section by itself, as there seem to have been many changes. But how do I get rid of the button?

<button class="button comments-button theme-kicker" data-skip-to-para-id="">
</button>

I also had trouble getting remove_tags_after to strip the content at the end of the page, so I ended up using a lot more remove_tags entries.

Spoiler:
#!/usr/bin/env python
# encoding: utf-8
"""
New York Times Personal Tech only
"""

# The docstring above must be the first statement in the module to be picked
# up as ``__doc__``; a ``from __future__`` import may legally follow it.
from __future__ import with_statement

# Recipe metadata.
__license__ = 'GPL 3'
__copyright__ = 'zotzo'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe

class NYTimesTechnology(BasicNewsRecipe):
    """Calibre news recipe for the New York Times Personal Tech RSS feed.

    The recipe is purely declarative: it only sets class attributes that
    ``BasicNewsRecipe`` reads.  Page clean-up is done by trimming everything
    outside the ``navigation-edge`` .. ``whats-next`` range and then removing
    the individual page-chrome elements listed in ``remove_tags``.
    """

    # --- Metadata -----------------------------------------------------------
    title = 'New York Times Personal Tech'
    __author__ = 'David Pogue'
    description = 'The latest in Personal technology'
    publisher = 'The New York Times'
    category = 'Technology'
    language = 'en'
    cover_url = 'http://bit.ly/g0SKJT'

    # --- Download behaviour -------------------------------------------------
    oldest_article = 4
    max_articles_per_feed = 100
    no_stylesheets = True
    # Fetch the linked article pages rather than relying on the feed body.
    use_embedded_content = False

    feeds = [
        (u'Personal Tech',
         u'http://www.nytimes.com/services/xml/rss/nyt/PersonalTech.xml'),
    ]

    # --- Page trimming ------------------------------------------------------
    # Drop everything before the navigation edge and after the "what's next"
    # block; whatever page chrome survives in between is handled by
    # remove_tags below.
    remove_tags_before = dict(name='div', attrs={'id': 'navigation-edge'})
    remove_tags_after = [dict(name='div', attrs={'id': 'whats-next'})]

    remove_tags = [
        # Interactive buttons (e.g. the "comments-button") are useless in an
        # e-book, so strip every <button> regardless of its class list.
        dict(name='button'),

        # Skip links and share-tool labels.
        dict(name='a', attrs={'href': '#story-continues-1'}),
        dict(name='a', attrs={'class': 'visually-hidden skip-to-text-link'}),
        dict(name='span', attrs={'class': 'sharetools-label'}),

        # Layout containers and site navigation.
        dict(name='div', attrs={'class': 'inside-story'}),
        dict(name='div', attrs={'class': 'column'}),
        dict(name='div', attrs={'class': 'column last-column'}),
        dict(name='div', attrs={'class': 'site-index'}),
        dict(name='div', attrs={'class': 'search-overlay'}),
        dict(name='div', attrs={'class': 'split-6-layout layout'}),
        # NOTE(review): original author was unsure this rule takes effect.
        dict(name='div', attrs={'role': 'navigation'}),
        dict(name='div', attrs={'id': 'site-index'}),

        # Placeholder / promotional containers, matched by id.
        dict(name='div', attrs={'id': 'XXL'}),
        dict(name='div', attrs={'id': 'MiddleRight'}),
        dict(name='div', attrs={'id': 'Top5'}),
        dict(name='div', attrs={'id': 'Inv1'}),
        dict(name='div', attrs={'id': 'Inv2'}),
        dict(name='div', attrs={'id': 'Inv3'}),
        dict(name='div', attrs={'id': 'ab1'}),
        dict(name='div', attrs={'id': 'ab2'}),
        dict(name='div', attrs={'id': 'ab3'}),
        dict(name='div', attrs={'id': 'Anchor'}),
        dict(name='div', attrs={'id': 'SponLink'}),

        # Section headings.
        dict(name='h2', attrs={'class': 'section-heading visually-hidden'}),
        dict(name='h2', attrs={'class': 'section-heading'}),

        # Page footer: matched by id on any tag (this also covers the
        # <div id="page-footer"> case) and by ARIA role.
        dict(id='page-footer'),
        dict(role='contentinfo'),
    ]
Camper65 is offline   Reply With Quote