11-24-2013, 03:37 PM | #1 |
Zealot
Posts: 115
Karma: 20
Join Date: Jul 2010
Device: Kindle3 3G, Kindle Paperwhite 2
Article download fails if I use postprocess_html function
Hi,
sorry to bug you with a probably stupid error, but here is my problem: I would like to post-process the HTML of the downloaded articles using the function below (for now it should only emit debugging messages):

Code:
def postprocess_html(self,soup,first):
    self.log('===== post process article');
    for tmp_link in soup.findAll('a',href=re.compile("[gtrj][0-9]+....html")):
        dummy=0;
        self.log('\t\t ====== found link: ' + self.tag_to_string(tmp_link).get('href'))
    return soup

As soon as this function is in the recipe, the article download fails. Very strange indeed. Does anybody have an idea where I am wrong?

Thanks, Jens
11-24-2013, 03:44 PM | #2 |
Zealot
Posts: 115
Karma: 20
Join Date: Jul 2010
Device: Kindle3 3G, Kindle Paperwhite 2
By the way, this is my complete recipe (it downloads recent decisions of the Boards of Appeal from the European Patent Office). The nz_parse_section function is used in a very unconventional way, but it works. The problem is in the postprocess_html function.
Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from contextlib import closing
from operator import itemgetter, attrgetter
import operator
import os, time, traceback, re, urlparse, sys, cStringIO, time, datetime, urllib
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
from datetime import date, datetime, time
from calibre import browser, __appname__, iswindows, \
    strftime, preferred_encoding
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre import entity_to_unicode
from calibre.web import Recipe
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
#from calibre.web.fetch.simple import RecursiveFetcherg
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
import sys
from PyQt4 import QtGui
from PyQt4 import QtCore


class decisions(BasicNewsRecipe):

    #####################
    # URL to start from
    #####################
    start_url = "http://www.epo.org/law-practice/case-law-appeals/recent.html"

    __author__ = 'Me myself and I'
    description = 'Recent Decisions of the Boards of Appeal of the EPO'
    timefmt = ''
    language = 'en_GB'
    publisher = u'European Patent Office'
    max_articles_per_feed = 1000
    #today=datetime.today()
    delay = 1

    #####################
    # get date of first and last decision from start_url
    #####################
    date_soup_file = urllib.urlopen(start_url)
    #print date_soup_file
    date_soup = BeautifulSoup(date_soup_file)
    date_list = date_soup.findAll(text=re.compile("Online on"))
    #print ('\t\t--------- search result array length: ' + str(len(date_list)))
    first_date = datetime.strptime((date_list[0].next.string),"%d.%m.%Y")
    last_date = datetime.strptime((date_list[len(date_list)-1].next.string),"%d.%m.%Y")
    #print first_date + ' ' + last_date
    #latest_date = date_list[1].next.string
    #log('\t\t--------- latest_date: ' + latest_date)
    #oldest_date=datetime.strptime("01.11.2013","%d.%m.%Y")

    #####################
    # dialog to get date
    #####################
    #create gui application for dialog (I think you do not need it it's a leftover)
    #app = QtGui.QApplication(sys.argv)
    #get widget
    #widget = QtGui.QWidget()
    #open dialog to input integer variables
    #tmp_no_feeds, ok=QtGui.QInputDialog.getInteger(widget, 'Input Dialog - Number of Feeds', 'Number of feeds:',8)
    #this would be the text input dialog
    #tmp_date, ok=QtGui.QInputDialog.getText(widget, 'Input Dialog - Publication date of oldest decision', 'Enter date:',QtGui.QLineEdit.Normal,'01.01.1900')
    #take value if ok was pressed otherwise use default value
    #if ok:
    #    oldest_date=datetime.strptime(str(tmp_date),"%d.%m.%Y")
    #else:
    #    # default
    #    oldest_date=datetime.strptime("01.01.1900","%d.%m.%Y")
    #you may now use the variable no_feeds
    #log('\t\toldest date=',oldest_date.strftime("%d.%m.%Y"))
    ###########################
    # end dialog
    ###########################

    title = "Recent BoA Decisions " + last_date.strftime("%d.%m.%y") + " - " + first_date.strftime("%d.%m.%y")
    print '===================== ' + title + ' ====================='

    #BeautifulSoup.MARKUP_MASSAGE=('<!#BeginList>','')

    conversion_options = {
        'linearize_tables': True,
    }
    #no_stylesheets = True

    #insert links to decisions
    #replace existing links in header
    #postprocess_regexps = [
    #    ((re.compile("[gtjr][0-9]+.html")),lambda x: "#" + str(re.match("[gtjr][0-9]+",x)))]

    remove_tags_before = dict(name='h1')
    #remove_tags_after = dict(name='div', attrs={'class':'callToAction'})
    remove_tags = [
        dict(name='style'),
        dict(name='script'),
        #dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),
        dict(name='div', attrs={'id':['epoHeader','siteMenu']}),
        dict(name='div', attrs={'id':['epoContentLeft','siteMenu']}),
        #dict(name='div', attrs={'id':['rHSBTitle','rHSBSection']}),
        dict(name='div', attrs={'style':['margin-top: 1em ; margin-bottom: 1em ; border: 1px solid black ; padding: .5em']}),
        #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}),
        #dict(name='table', attrs={'cellspacing':'0'}),
    ]

    #my own function to remove erroneous strings before conversion into beautiful soup
    def my_index_to_soup(self, url_or_raw, raw=False):
        if re.match(r'\w+://', url_or_raw):
            open_func = getattr(self.browser, 'open_novisit', self.browser.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw:
                raise RuntimeError('Could not fetch index from %s'%url_or_raw)
        else:
            _raw = url_or_raw
        if raw:
            return _raw
        if not isinstance(_raw, unicode) and self.encoding:
            if callable(self.encoding):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')
        massage = list(BeautifulSoup.MARKUP_MASSAGE)
        enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
        massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=enc)))
        #remove erroneous strings from input file
        massage.append((re.compile("<!#BeginList>"), lambda match:''))
        massage.append((re.compile("<!#EndList>"), lambda match:''))
        return BeautifulSoup(_raw, markupMassage=massage)

    #TO GET ARTICLES IN SECTION
    def nz_parse_section(self, url):
        #global oldest_date
        #global start_url

        #initialize article lists for single pass
        articles_avm = []
        articles_ele = []
        articles_phy = []
        articles_che = []
        articles_mec = []
        articles_pro = []
        articles_j = []
        articles_g = []
        articles_r = []
        articles_cat = []
        articles_exc = []
        #cited decisions
        cited_decisions = []
        cited_decisions_feed = []
        #initialize feeds
        feeds = []
        cited_decisions = []
        #current_articles = []
        filter = url  #url parameter is misused as a filter, url is hard coded here

        self.log('\t---- starting single pass scan')
        soup = self.index_to_soup(self.start_url)
        #div = soup.find(attrs={'class':'col-300 categoryList'})
        #date = div.find(attrs={'class':'link-list-heading'})

        #go over all tables of class application
        for i_month in soup.findAll('table',attrs={"class" : "application"}):
            self.log('\t---- month loop: ' + self.tag_to_string(i_month.previous.previous.previous))
            row_count = 0;
            #go over all rows within an application table
            for i_row in i_month.next.next.next.findNextSiblings('tr'):
                if (len(cited_decisions)>20):
                    break;
                #self.log('\t\t-------- row loop' + str(row_count))
                #extract all information about a decision
                #title row - extract case and url
                if (row_count == 0):
                    tmp_case_title = self.tag_to_string(i_row)
                    tmp_case_split = tmp_case_title.split(" ");
                    tmp_case = tmp_case_split[1] + tmp_case_split[2];
                    #self.log('\t\t--------- case: ' + tmp_case)
                    tmp_url = "http://www.epo.org" + i_row.findNext('a').findNext('a').get('href',None)
                    #self.log('\t\t--------- url: ' + tmp_url)
                #second row: board, date, language, classes, application
                elif (row_count == 1):
                    cells = i_row.findAllNext('td',limit=6)
                    #board
                    tmp_board = self.tag_to_string(cells[1])
                    tmp_board = tmp_board[5:len(tmp_board)]
                    #self.log('\t\t--------- board: ' + tmp_board)
                    #pub_date
                    tmp_date = self.tag_to_string(cells[0])
                    tmp_date = tmp_date[9:len(tmp_date)]
                    #self.log('\t\t--------- date: ' + tmp_date)
                    #language
                    tmp_language = self.tag_to_string(cells[3])
                    tmp_language = tmp_language[14:len(tmp_language)]
                    #self.log('\t\t--------- language: ' + tmp_language)
                    #classes - only the first class
                    #tmp_classes=self.tag_to_string(cells[4])
                    #tmp_classes=tmp_classes[3:min(13,len(tmp_classes))]
                    #tmp_classes=tmp_classes.replace(' ','')
                    #self.log('\t\t--------- classes: ' + tmp_classes)
                elif (row_count == 2):
                    cells = i_row.findAllNext('td',limit=3)
                    #keywords
                    tmp_keywords = self.tag_to_string(cells[1])
                    tmp_keywords = tmp_board[9:len(tmp_keywords)]
                    #self.log('\t\t--------- keywords: ' + tmp_keywords)
                #elif (row_count == 3):

                row_count = row_count+1;

                #new decision coming up
                if (row_count>3):
                    row_count = 0;
                    #get additional data from decision page
                    article_soup = self.index_to_soup(tmp_url)
                    #get title
                    tmp_title = self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3TLE"}).get('content',None))
                    #self.log('\t\t--------- title: ' + tmp_title)
                    #Articles used in decision
                    tmp_articles = self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3ArtRef"}).get('content',None))
                    if (tmp_articles != ""):
                        tmp_articles = "A" + tmp_articles.replace(", "," A")
                    #self.log('\t\t--------- articles: ' + tmp_articles)
                    #Rules used in decision
                    tmp_rules = self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3RuleRef"}).get('content',None))
                    if (tmp_rules != ""):
                        tmp_rules = "R" + tmp_rules.replace(", "," R")
                    #self.log('\t\t--------- rules: ' + tmp_rules)
                    #articles and rules in tmp_epc
                    tmp_epc = tmp_articles
                    if (tmp_rules != ""):
                        tmp_epc = tmp_epc + " " + tmp_rules
                    #self.log('\t\t--------- epc: ' + tmp_epc)
                    #get classes from metadata
                    tmp_all_classes = self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3CaseIPC"}).get('content',None))
                    tmp_all_classes = tmp_all_classes.replace(" ","")
                    tmp_classes = (tmp_all_classes.split(","))[0]  #get first class for title
                    #self.log('\t\t--------- all_classes: ' + tmp_all_classes + ' first class: ' + tmp_classes)
                    #check if catchwords are present
                    if (tmp_language == "EN"):
                        tmp_catchword = article_soup.find(text=re.compile("Catchwords"))
                    elif (tmp_language == "DE"):
                        tmp_catchword = article_soup.find(text=re.compile("Orientierungssatz"))
                    elif (tmp_language == "FR"):
                        tmp_catchword = article_soup.find(text=re.compile("Exergue"))
                    if (tmp_catchword != None):
                        tmp_catchword = self.tag_to_string(tmp_catchword.next.next)
                        #self.log('\t\t--------- case:' + tmp_case + ' catchword: *' + tmp_catchword + '*')
                        if (tmp_catchword != ' - '):
                            is_catchword = True
                        else:
                            is_catchword = False
                    else:
                        is_catchword = False
                    #check for cited decisions
                    tmp_cited_decisions = self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3aDCI"}).get('content',None))
                    #go over all cited decisions
                    if (tmp_cited_decisions != ''):
                        self.log('\t\t no of cited decisions: ' + str(len(tmp_cited_decisions.split(","))))
                        tmp_cited_decisions = tmp_cited_decisions.replace(' ','')
                        for cit_dec in tmp_cited_decisions.split(","):
                            #check if decision already exists in
                            if not any((cited_decision['id']==cit_dec) for cited_decision in cited_decisions):
                                #determine url
                                tmp_cit_url = cit_dec[1:len(cit_dec)].split("/")
                                tmp_cit_url = tmp_cit_url[1] + tmp_cit_url[0]
                                #self.log('\t\t tmp_cit_url: ' + tmp_cit_url)
                                tmp_cit_url = article_soup.find('a',href=re.compile(tmp_cit_url))
                                if (tmp_cit_url != None):
                                    tmp_cit_url = self.tag_to_string(tmp_cit_url.get('href'))
                                    tmp_cit_url = "http://www.epo.org/law-practice/case-law-appeals/recent/" + str(tmp_cit_url)
                                #add to list of cited decisions
                                cited_decisions.append({'id': cit_dec,'url': tmp_cit_url, 'num_citings' : 1, 'citings': tmp_case})
                                self.log('\t\t added cited decision: ' + cit_dec + ' [' + str(len(cited_decisions)) + '] ' + tmp_cit_url)
                            #already exists
                            else:
                                for tmp_el in cited_decisions:
                                    if (tmp_el['id'] == cit_dec):
                                        tmp_el['num_citings'] += 1
                                        tmp_el['citings'] += ' ' + tmp_case
                                        self.log('\t\t existing citation: ' + cit_dec + ' [' + str(tmp_el['num_citings']) + ']: ' + tmp_el['citings'] )
                    #check if Article 52(2) is mentioned in decision
                    tmp_exception = article_soup.find(text=re.compile("52[13,()]+2"))
                    if (tmp_exception == None):
                        is_EXC = False
                    else:
                        is_EXC = True
                    #filter decisions
                    is_G = (re.search("G",tmp_case))
                    is_R = (re.search("R",tmp_case))
                    is_J = (re.search("J",tmp_case))
                    is_TAVM = ((re.search("H04N",tmp_all_classes) or re.search("G11B27",tmp_classes)))
                    is_TEL = ((re.search("3.5.",tmp_board)))
                    is_TPH = ((re.search("3.4",tmp_board)))
                    is_TCH = ((re.search("3.3.",tmp_board)))
                    is_TME = ((re.search("3.2",tmp_board)))
                    is_TPROC = (((re.search("A113",tmp_epc)) or (re.search("A116",tmp_epc)) or (re.search("R137",tmp_epc))) and re.search("T",tmp_case))
                    is_CATCH = (is_catchword)

                    article_header = tmp_classes + " - " + tmp_case + " - " + "[" + tmp_epc + "]" + " " + tmp_title[:45] + " (" + tmp_date + ")"
                    self.log('\t\tFound decision:', article_header)

                    #sort article into arrays
                    if (is_G):
                        articles_g.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                        self.log('\t\t ----> G(' + str(len(articles_g)) + ')')
                    if (is_EXC):
                        articles_exc.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                        self.log('\t\t ----> EXCEPTION(' + str(len(articles_exc)) + ')')
                    if (is_J):
                        articles_j.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                        self.log('\t\t ----> J(' + str(len(articles_j)) + ')')
                    if (is_TPROC):
                        articles_pro.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                        self.log('\t\t ----> PROC(' + str(len(articles_pro)) + ')')
                    if (is_R):
                        articles_r.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                        self.log('\t\t ----> R(' + str(len(articles_r)) + ')')
                    if (is_TAVM):
                        articles_avm.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                        self.log('\t\t ----> AVM(' + str(len(articles_avm)) + ')')
                    #if (is_TEL):
                    #    articles_ele.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                    #    self.log('\t\t ----> ELECTRONICS(' + str(len(articles_ele)) + ')')
                    #
                    #if (is_TPH):
                    #    articles_phy.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                    #    self.log('\t\t ----> PHYSYICS(' + str(len(articles_phy)) + ')')
                    #
                    #if (is_TCH):
                    #    articles_che.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                    #    self.log('\t\t ----> CHEMISTRY(' + str(len(articles_che)) + ')')
                    #
                    #if (is_TME):
                    #    articles_mec.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                    #    self.log('\t\t ----> MECHANICS(' + str(len(articles_mec)) + ')')
                    if (is_CATCH):
                        articles_cat.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                        self.log('\t\t ----> CATCHWORD(' + str(len(articles_cat)) + ')')
                    #self.log('==============================================')
                    #input("Press Enter to continue...")

        #end of loop => add all collected articles
        self.log('============== LOOP END ====================')
        if articles_avm:
            feeds.append(('AVM T Decisions', articles_avm))
            self.log('--> added ' + str(len(articles_avm)) + ' AVM T Decisions');
        if articles_ele:
            feeds.append(('Electricity T Decisions', articles_ele))
            self.log('--> added ' + str(len(articles_ele)) + ' Electricity T Decisions');
        if articles_phy:
            feeds.append(('Physics T Decisions', articles_phy))
            self.log('--> added ' + str(len(articles_phy)) + ' Physics T Decisions');
        if articles_che:
            feeds.append(('Chemistry T Decisions', articles_che))
            self.log('--> added ' + str(len(articles_che)) + ' Chemistry T Decisions');
        if articles_mec:
            feeds.append(('Mechanics T Decisions', articles_mec))
            self.log('--> added ' + str(len(articles_mec)) + ' Mechanics T Decisions');
        if articles_pro:
            feeds.append(('Procedural T Decisions', articles_pro))
            self.log('--> added ' + str(len(articles_pro)) + ' Procedural T Decisions');
        if articles_exc:
            feeds.append(('A52(2) T Decisions', articles_exc))
            self.log('--> added ' + str(len(articles_exc)) + ' A52(2) T Decisions');
        if articles_j:
            feeds.append(('J Decisions', articles_j))
            self.log('--> added ' + str(len(articles_j)) + ' J Decisions');
        if articles_g:
            feeds.append(('G Decisions', articles_g))
            self.log('--> added ' + str(len(articles_g)) + ' G Decisions');
        if articles_r:
            feeds.append(('R Decisions', articles_r))
            self.log('--> added ' + str(len(articles_r)) + ' R Decisions');
        if articles_cat:
            feeds.append(('Catchword Decisions', articles_cat))
            self.log('--> added ' + str(len(articles_cat)) + ' Catchword Decisions');

        #sort cited decisions
        sorted_decisions = sorted(cited_decisions,key=lambda k: k['num_citings'],reverse=True)
        #add cited decisions
        for tmp_dec in sorted_decisions:
            cited_decisions_feed.append({'title': tmp_dec['id'] + ' [' + str(tmp_dec['num_citings']) + ']', 'url': tmp_dec['url'], 'description': tmp_dec['id'], 'date':tmp_date})
            self.log(str(tmp_dec['num_citings']) + ' ');
        if cited_decisions_feed:
            feeds.append(('Cited Decisions', cited_decisions_feed))
            self.log('--> added ' + str(len(cited_decisions_feed)) + ' Cited decisions');

        return feeds

    #TO GET SECTIONS
    def parse_index(self):
        feeds = []
        #for title, url in [
        #    ('AVM T Decisions', 'TAVM'),
        #    ('Electricity T Decisions', 'TEL'),
        #    ('Physics T Decisions', 'TPH'),
        #    ('Chemistry T Decisions', 'TCH'),
        #    ('Mechanics T Decisions', 'TME'),
        #    ('Procedural T Decisions', 'TPROC'),
        #    ('J Decisions', 'J'),
        #    ('G Decisions', 'G'),
        #    ('R Decisions', 'R'),
        #    ('Catchword Decisions', 'CATCH'),
        #]:
        feeds = self.nz_parse_section(self.start_url)
        return feeds

    def postprocess_html(self,soup,first):
        self.log('===== post process article');
        for tmp_link in soup.findAll('a',href=re.compile("[gtrj][0-9]+....html")):
            dummy = 0;
            #self.log('\t\t ====== found link: ' + self.tag_to_string(tmp_link).get('href'))
        return soup
11-24-2013, 10:50 PM | #3 |
creator of calibre
Posts: 43,930
Karma: 22669820
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
Run your download with -vv and you will get a detailed traceback showing you what the problem is.
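For a recipe kept in its own file, a typical way to do this (per the calibre documentation on developing recipes) is to run it through ebook-convert from a terminal, e.g.:
Code:
ebook-convert my_recipe.recipe output.epub --test -vv
The file names here are only placeholders for your own recipe and output file; --test limits the download to a few articles per feed so the verbose log and the traceback appear quickly.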
11-26-2013, 04:17 PM | #4 |
Zealot
Posts: 115
Karma: 20
Join Date: Jul 2010
Device: Kindle3 3G, Kindle Paperwhite 2
That did the trick. Thanks.
I discovered a stupid typo.
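For anyone who hits the same failure: a likely culprit in a snippet like the one in post #1 is that .get('href') is called on the string returned by tag_to_string() rather than on the tag itself, which raises an AttributeError during post-processing. A minimal sketch of the logging loop, assuming that was the typo:
Code:
def postprocess_html(self, soup, first):
    self.log('===== post process article')
    for tmp_link in soup.findAll('a', href=re.compile("[gtrj][0-9]+....html")):
        # call get('href') on the tag itself; tag_to_string() returns the
        # link text as a plain string, which has no get() method
        self.log('\t\t ====== found link: ' + tmp_link.get('href'))
    return soup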