View Single Post
Old 01-28-2011, 11:24 AM   #1
sorcer
Junior Member
sorcer began at the beginning.
 
Posts: 5
Karma: 10
Join Date: Jan 2011
Device: Kindle 3 WIFI
Struggling with one website

Hello!

I have tried to fetch a Russian website, www.snob.ru, with this recipe:



import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class Snob(BasicNewsRecipe):
    """Calibre news recipe for the Russian magazine site www.snob.ru.

    The recipe logs in with the credentials supplied by the user
    (``needs_subscription`` is set), then downloads the RSS feeds listed
    in ``feeds`` and strips each article page down to the story body.
    """

    title = 'Snob'
    __author__ = 'Me'
    description = 'Business news from Russian posh magazine'
    # Was misspelled ``timemft``; calibre reads the date format from
    # ``timefmt``, so the misspelled attribute had no effect.
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = True
    oldest_article = 21
    max_articles_per_feed = 50
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'Snob Media'
    category = 'news, Russia, world'
    language = 'ru_RU'
    publication_type = 'newsportal'
    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    # Drop HTML comments (including multi-line ones) before parsing.
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
    }

    def get_browser(self):
        """Return a browser, logged in to snob.ru when credentials are set.

        Returns:
            The browser object created by ``BasicNewsRecipe.get_browser``.
            If both ``self.username`` and ``self.password`` are not None,
            the login form has been filled in and submitted first.
        """
        # The base method is called through the class, so ``self`` has to
        # be passed explicitly; omitting it raises a TypeError.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.snob.ru/login')
            # NOTE(review): select_form(name=...) matches the <form>
            # element's ``name`` attribute. If the browser reports that
            # no such form exists, 'auth-wrapper' is presumably an id or
            # class rather than a form name -- verify against the login
            # page's HTML, or select the form by index (``nr=``) instead.
            br.select_form(name='auth-wrapper')
            br['USERNAME'] = self.username
            br['PASSWORD'] = self.password
            br.submit()
        return br

    # Keep only the page layout block and the story body itself.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['layout-block-a layout-block']}),
        dict(attrs={'class': ['story-body', 'storybody']}),
    ]

    # Remove side boxes, sharing widgets, embedded links and all images.
    remove_tags = [
        dict(name='div', attrs={'class': [
            'story-feature related narrow',
            'share-help',
            'embedded-hyper',
            'story-feature wide ',
            'story-feature narrow',
        ]}),
        dict(name=['img']),
    ]

    remove_attributes = ['width', 'height']

    # (section title, RSS feed URL) pairs.
    feeds = [
        ('Politics', 'http://www.snob.ru/rss/blog/927'),
        ('Business', 'http://www.snob.ru/rss/blog/420'),
        ('Science', 'http://www.snob.ru/rss/blog/171'),
        ('Children', 'http://www.snob.ru/rss/blog/70'),
        ('Food and Alcohol', 'http://www.snob.ru/rss/blog/173'),
        ('Health', 'http://www.snob.ru/rss/blog/174'),
        ('Culture', 'http://www.snob.ru/rss/blog/683'),
        ('How to live', 'http://www.snob.ru/rss/blog/170'),
        ('Sex', 'http://www.snob.ru/rss/blog/69'),
        ('Interview', 'http://www.snob.ru/rss/blog/805'),
        ('XX century', 'http://www.snob.ru/rss/blog/416'),
        ('Editorial', 'http://www.snob.ru/rss/blog/894'),
        ('Chichvarkin', 'http://www.snob.ru/rss/pblog/8503'),
    ]


The error I get with this code points at the line `br.select_form(name='auth-wrapper')`: it says that no form named 'auth-wrapper' was found. Does anyone have any idea how I can log in at www.snob.ru/login before downloading?

Many thanks in advance.
sorcer is offline   Reply With Quote