By the way, here is my complete recipe (it downloads the recent decisions of the Boards of Appeal from the European Patent Office site). The nz_parse_section function is used in a very unconventional way, but it works. The problem is in the postprocess_html function.
Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from contextlib import closing
from operator import itemgetter, attrgetter
import operator
import os, time, traceback, re, urlparse, sys, cStringIO, datetime, urllib
from collections import defaultdict
from functools import partial
from datetime import date, datetime, time
from calibre import browser, __appname__, iswindows, \
strftime, preferred_encoding
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre import entity_to_unicode
from calibre.web import Recipe
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
#from calibre.web.fetch.simple import RecursiveFetcher
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
from PyQt4 import QtGui
from PyQt4 import QtCore
class decisions(BasicNewsRecipe):
#####################
# URL to start from
#####################
start_url="http://www.epo.org/law-practice/case-law-appeals/recent.html"
__author__ = 'Me myself and I'
description = 'Recent Decisions of the Boards of Appeal of the EPO'
timefmt = ''
language = 'en_GB'
publisher = u'European Patent Office'
max_articles_per_feed = 1000
#today=datetime.today()
delay=1
#####################
#get date of first and last decision from start_url
#####################
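#note: this block runs at class definition time (there is no recipe instance yet), which is presumably why plain urllib is used here instead of self.browser / self.index_to_soup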
date_soup_file=urllib.urlopen(start_url)
#print date_soup_file
date_soup = BeautifulSoup(date_soup_file)
date_list = date_soup.findAll(text=re.compile("Online on"))
#print ('\t\t--------- search result array length: ' + str(len(date_list)))
first_date = datetime.strptime(date_list[0].next.string, "%d.%m.%Y")
last_date = datetime.strptime(date_list[-1].next.string, "%d.%m.%Y")
#print first_date + ' ' + last_date
#latest_date = date_list[1].next.string
#log('\t\t--------- latest_date: ' + latest_date)
#oldest_date=datetime.strptime("01.11.2013","%d.%m.%Y")
#####################
# dialog to get date
#####################
#create gui application for dialog (I think you do not need this; it's a leftover)
#app = QtGui.QApplication(sys.argv)
#get widget
#widget = QtGui.QWidget()
#open dialog to input integer variables
#tmp_no_feeds, ok=QtGui.QInputDialog.getInteger(widget, 'Input Dialog - Number of Feeds', 'Number of feeds:',8)
#this would be the text input dialog
#tmp_date, ok=QtGui.QInputDialog.getText(widget, 'Input Dialog - Publication date of oldest decision', 'Enter date:',QtGui.QLineEdit.Normal,'01.01.1900')
#take value if ok was pressed otherwise use default value
#if ok:
# oldest_date=datetime.strptime(str(tmp_date),"%d.%m.%Y")
#else:
# default
# oldest_date=datetime.strptime("01.01.1900","%d.%m.%Y")
#you may now use the variable no_feeds
#log('\t\toldest date=',oldest_date.strftime("%d.%m.%Y"))
###########################
# end dialog
###########################
title = "Recent BoA Decisions " + last_date.strftime("%d.%m.%y") + " - " + first_date.strftime("%d.%m.%y")
print '===================== ' + title + ' ====================='
#BeautifulSoup.MARKUP_MASSAGE=('<!#BeginList>','')
conversion_options = {
'linearize_tables' : True,
}
#no_stylesheets = True
#insert links to decisions
#replace existing links in header
#postprocess_regexps = [
#((re.compile("[gtjr][0-9]+.html")),lambda x: "#" + str(re.match("[gtjr][0-9]+",x)))]
remove_tags_before = dict(name='h1')
#remove_tags_after = dict(name='div', attrs={'class':'callToAction'})
remove_tags = [
dict(name='style'),
dict(name='script'),
#dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),
dict(name='div', attrs={'id':['epoHeader','epoContentLeft','siteMenu']}),
# dict(name='div', attrs={'id':['rHSBTitle','rHSBSection']}),
dict(name='div', attrs={'style':['margin-top: 1em ; margin-bottom: 1em ; border: 1px solid black ; padding: .5em']}),
#dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}),
#dict(name='table', attrs={'cellspacing':'0'}),
]
#my own function to remove erroneous strings before conversion into beautiful soup
def my_index_to_soup(self, url_or_raw, raw=False):
if re.match(r'\w+://', url_or_raw):
open_func = getattr(self.browser, 'open_novisit', self.browser.open)
with closing(open_func(url_or_raw)) as f:
_raw = f.read()
if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
if callable(self.encoding):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
massage.append((re.compile(r'&(\S+?);'), lambda match:
entity_to_unicode(match, encoding=enc)))
#remove erroneous strings from input file
massage.append((re.compile("<!#BeginList>"), lambda match:''))
massage.append((re.compile("<!#EndList>"), lambda match:''))
return BeautifulSoup(_raw, markupMassage=massage)
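#note: the methods below still call the stock self.index_to_soup(); to actually apply the
#markup massage above, those calls would have to use self.my_index_to_soup() instead,
#e.g. soup = self.my_index_to_soup(self.start_url)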
#TO GET ARTICLES IN SECTION
def nz_parse_section(self, url):
#global oldest_date
#global start_url
#initialize article lists for single pass
articles_avm = []
articles_ele = []
articles_phy = []
articles_che = []
articles_mec = []
articles_pro = []
articles_j = []
articles_g = []
articles_r = []
articles_cat = []
articles_exc = []
#cited decisions
cited_decisions = []
cited_decisions_feed = []
#initialize feeds
feeds = []
#current_articles = []
filter=url
#the url parameter is misused as a filter; the page that is actually scanned is hard coded via self.start_url below
self.log('\t---- starting single pass scan')
soup = self.index_to_soup(self.start_url)
#div = soup.find(attrs={'class':'col-300 categoryList'})
#date = div.find(attrs={'class':'link-list-heading'})
#go over all tables of class application
for i_month in soup.findAll('table',attrs={"class" : "application"}):
self.log('\t---- month loop: ' + self.tag_to_string(i_month.previous.previous.previous))
row_count = 0;
#go over all rows within an application table
for i_row in i_month.next.next.next.findNextSiblings('tr'):
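#stop scanning rows once more than 20 cited decisions have been collected (looks like a cap left in for testing)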
if (len(cited_decisions)>20):
break;
#self.log('\t\t-------- row loop' + str(row_count))
#extract all information about a decision
#title row - extract case and url
if (row_count == 0):
tmp_case_title = self.tag_to_string(i_row)
tmp_case_split = tmp_case_title.split(" ");
tmp_case=tmp_case_split[1] + tmp_case_split[2];
#self.log('\t\t--------- case: ' + tmp_case)
tmp_url="http://www.epo.org" + i_row.findNext('a').findNext('a').get('href',None)
#self.log('\t\t--------- url: ' + tmp_url)
#second row: board, date, language, classes, application
elif (row_count == 1):
cells = i_row.findAllNext('td',limit=6)
#board
tmp_board=self.tag_to_string(cells[1])
tmp_board=tmp_board[5:len(tmp_board)]
#self.log('\t\t--------- board: ' + tmp_board)
#pub_date
tmp_date=self.tag_to_string(cells[0])
tmp_date=tmp_date[9:len(tmp_date)]
#self.log('\t\t--------- date: ' + tmp_date)
#language
tmp_language=self.tag_to_string(cells[3])
tmp_language=tmp_language[14:len(tmp_language)]
#self.log('\t\t--------- language: ' + tmp_language)
#classes - only the first class
#tmp_classes=self.tag_to_string(cells[4])
#tmp_classes=tmp_classes[3:min(13,len(tmp_classes))]
#tmp_classes=tmp_classes.replace(' ','')
#self.log('\t\t--------- classes: ' + tmp_classes)
elif (row_count == 2):
cells = i_row.findAllNext('td',limit=3)
#keywords
tmp_keywords=self.tag_to_string(cells[1])
tmp_keywords=tmp_keywords[9:len(tmp_keywords)]
#self.log('\t\t--------- keywords: ' + tmp_keywords)
#elif (row_count == 3):
row_count=row_count+1;
#new decision coming up
if (row_count>3):
row_count=0;
#get additional data from decision page
article_soup=self.index_to_soup(tmp_url)
#get title
tmp_title=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3TLE"}).get('content',None))
#self.log('\t\t--------- title: ' + tmp_title)
#Articles used in decision
tmp_articles=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3ArtRef"}).get('content',None))
if (tmp_articles != ""):
tmp_articles = "A" + tmp_articles.replace(", "," A")
#self.log('\t\t--------- articles: ' + tmp_articles)
#Rules used in decision
tmp_rules=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3RuleRef"}).get('content',None))
if (tmp_rules != ""):
tmp_rules = "R" + tmp_rules.replace(", "," R")
#self.log('\t\t--------- rules: ' + tmp_rules)
#articles and rules in tmp_epc
tmp_epc = tmp_articles
if (tmp_rules != ""):
tmp_epc = tmp_epc + " " + tmp_rules
#self.log('\t\t--------- epc: ' + tmp_epc)
#get classes from metadata
tmp_all_classes=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3CaseIPC"}).get('content',None))
tmp_all_classes = tmp_all_classes.replace(" ","")
tmp_classes = (tmp_all_classes.split(","))[0] #get first class for title
#self.log('\t\t--------- all_classes: ' + tmp_all_classes + ' first class: ' + tmp_classes)
#check if catchwords are present (default to None in case of an unexpected language value)
tmp_catchword = None
if (tmp_language =="EN"):
tmp_catchword=article_soup.find(text=re.compile("Catchwords"))
elif (tmp_language == "DE"):
tmp_catchword=article_soup.find(text=re.compile("Orientierungssatz"))
elif (tmp_language == "FR"):
tmp_catchword=article_soup.find(text=re.compile("Exergue"))
if (tmp_catchword != None):
tmp_catchword=self.tag_to_string(tmp_catchword.next.next)
#self.log('\t\t--------- case:' + tmp_case + ' catchword: *' + tmp_catchword + '*')
if (tmp_catchword != ' - '):
is_catchword = True
else:
is_catchword = False
else:
is_catchword = False
#check for cited decisions
tmp_cited_decisions=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3aDCI"}).get('content',None))
#go over all cited decisions
if (tmp_cited_decisions != ''):
self.log('\t\t no of cited decisions: ' + str(len(tmp_cited_decisions.split(","))))
tmp_cited_decisions = tmp_cited_decisions.replace(' ','')
for cit_dec in tmp_cited_decisions.split(","):
#check if the decision already exists in the list of cited decisions
if not any((cited_decision['id']==cit_dec) for cited_decision in cited_decisions):
#determine url
tmp_cit_url = cit_dec[1:len(cit_dec)].split("/")
tmp_cit_url = tmp_cit_url[1] + tmp_cit_url[0]
#self.log('\t\t tmp_cit_url: ' + tmp_cit_url)
tmp_cit_url = article_soup.find('a',href=re.compile(tmp_cit_url))
if (tmp_cit_url != None):
tmp_cit_url = self.tag_to_string(tmp_cit_url.get('href'))
tmp_cit_url = "http://www.epo.org/law-practice/case-law-appeals/recent/" + str(tmp_cit_url)
#add to list of cited decisions
cited_decisions.append({'id': cit_dec,'url': tmp_cit_url, 'num_citings' : 1, 'citings': tmp_case})
self.log('\t\t added cited decision: ' + cit_dec + ' [' + str(len(cited_decisions)) + '] ' + tmp_cit_url)
#already exists
else:
for tmp_el in cited_decisions:
if (tmp_el['id'] == cit_dec):
tmp_el['num_citings'] += 1
tmp_el['citings'] += ' ' + tmp_case
self.log('\t\t existing citation: ' + cit_dec + ' [' + str(tmp_el['num_citings']) + ']: ' + tmp_el['citings'] )
#check if Article 52(2) is mentioned in the decision
#(the pattern matches "52" followed by any mix of "1", "3", ",", "(" and ")" and then a "2", e.g. "52(2)")
tmp_exception=article_soup.find(text=re.compile("52[13,()]+2"))
is_EXC = (tmp_exception is not None)
#filter decisions
is_G=(re.search("G",tmp_case))
is_R=(re.search("R",tmp_case))
is_J=(re.search("J",tmp_case))
is_TAVM=((re.search("H04N",tmp_all_classes) or re.search("G11B27",tmp_classes)))
is_TEL=((re.search("3.5.",tmp_board)))
is_TPH=((re.search("3.4",tmp_board)))
is_TCH=((re.search("3.3.",tmp_board)))
is_TME=((re.search("3.2",tmp_board)))
is_TPROC=(((re.search("A113",tmp_epc)) or (re.search("A116",tmp_epc)) or (re.search("R137",tmp_epc))) and re.search("T",tmp_case))
is_CATCH=(is_catchword)
article_header = tmp_classes + " - " + tmp_case + " - " + "[" + tmp_epc + "]" + " " + tmp_title[:45] + " (" + tmp_date + ")"
self.log('\t\tFound decision:', article_header)
#sort article into arrays
if (is_G):
articles_g.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
self.log('\t\t ----> G(' + str(len(articles_g)) + ')')
if (is_EXC):
articles_exc.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
self.log('\t\t ----> EXCEPTION(' + str(len(articles_exc)) + ')')
if (is_J):
articles_j.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
self.log('\t\t ----> J(' + str(len(articles_j)) + ')')
if (is_TPROC):
articles_pro.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
self.log('\t\t ----> PROC(' + str(len(articles_pro)) + ')')
if (is_R):
articles_r.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
self.log('\t\t ----> R(' + str(len(articles_r)) + ')')
if (is_TAVM):
articles_avm.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
self.log('\t\t ----> AVM(' + str(len(articles_avm)) + ')')
# if (is_TEL):
# articles_ele.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
# self.log('\t\t ----> ELECTRONICS(' + str(len(articles_ele)) + ')')
#
# if (is_TPH):
# articles_phy.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
# self.log('\t\t ----> PHYSICS(' + str(len(articles_phy)) + ')')
#
# if (is_TCH):
# articles_che.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
# self.log('\t\t ----> CHEMISTRY(' + str(len(articles_che)) + ')')
#
# if (is_TME):
# articles_mec.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
# self.log('\t\t ----> MECHANICS(' + str(len(articles_mec)) + ')')
if (is_CATCH):
articles_cat.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
self.log('\t\t ----> CATCHWORD(' + str(len(articles_cat)) + ')')
#self.log('==============================================')
#input("Press Enter to continue...")
#end of loop => add all collected articles
self.log('============== LOOP END ====================')
if articles_avm:
feeds.append(('AVM T Decisions', articles_avm))
self.log('--> added ' + str(len(articles_avm)) + ' AVM T Decisions');
if articles_ele:
feeds.append(('Electricity T Decisions', articles_ele))
self.log('--> added ' + str(len(articles_ele)) + ' Electricity T Decisions');
if articles_phy:
feeds.append(('Physics T Decisions', articles_phy))
self.log('--> added ' + str(len(articles_phy)) + ' Physics T Decisions');
if articles_che:
feeds.append(('Chemistry T Decisions', articles_che))
self.log('--> added ' + str(len(articles_che)) + ' Chemistry T Decisions');
if articles_mec:
feeds.append(('Mechanics T Decisions', articles_mec))
self.log('--> added ' + str(len(articles_mec)) + ' Mechanics T Decisions');
if articles_pro:
feeds.append(('Procedural T Decisions', articles_pro))
self.log('--> added ' + str(len(articles_pro)) + ' Procedural T Decisions');
if articles_exc:
feeds.append(('A52(2) T Decisions', articles_exc))
self.log('--> added ' + str(len(articles_exc)) + ' A52(2) T Decisions');
if articles_j:
feeds.append(('J Decisions', articles_j))
self.log('--> added ' + str(len(articles_j)) + ' J Decisions');
if articles_g:
feeds.append(('G Decisions', articles_g))
self.log('--> added ' + str(len(articles_g)) + ' G Decisions');
if articles_r:
feeds.append(('R Decisions', articles_r))
self.log('--> added ' + str(len(articles_r)) + ' R Decisions');
if articles_cat:
feeds.append(('Catchword Decisions', articles_cat))
self.log('--> added ' + str(len(articles_cat)) + ' Catchword Decisions');
#sort cited decisions
sorted_decisions = sorted(cited_decisions,key=lambda k: k['num_citings'],reverse=True)
#add cited decisions
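#note: tmp_date here is simply the date of the last decision parsed in the loop above; the cited decisions themselves do not carry a date of their own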
for tmp_dec in sorted_decisions:
cited_decisions_feed.append({'title': tmp_dec['id'] + ' [' + str(tmp_dec['num_citings']) + ']', 'url': tmp_dec['url'], 'description': tmp_dec['id'], 'date':tmp_date})
self.log(str(tmp_dec['num_citings']) + ' ');
if cited_decisions_feed:
feeds.append(('Cited Decisions', cited_decisions_feed))
self.log('--> added ' + str(len(cited_decisions_feed)) + ' Cited decisions');
return feeds
# To GET SECTIONS
def parse_index(self):
feeds = []
# for title, url in [
# ('AVM T Decisions',
# 'TAVM'),
# ('Electricity T Decisions',
# 'TEL'),
# ('Physics T Decisions',
# 'TPH'),
# ('Chemistry T Decisions',
# 'TCH'),
# ('Mechanics T Decisions',
# 'TME'),
# ('Procedural T Decisions',
# 'TPROC'),
# ('J Decisions',
# 'J'),
# ('G Decisions',
# 'G'),
# ('R Decisions',
# 'R'),
# ('Catchword Decisions',
# 'CATCH'),
# ]:
feeds = self.nz_parse_section(self.start_url)
return feeds
def postprocess_html(self,soup,first):
self.log('===== post process article')
for tmp_link in soup.findAll('a',href=re.compile("[gtrj][0-9]+....html")):
dummy=0
#self.log('\t\t ====== found link: ' + tmp_link.get('href'))
return soup
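For reference, this is roughly what I want postprocess_html to end up doing. It is only a sketch: it assumes the relative decision links (hrefs like t123456eu1.html) should be rewritten into absolute epo.org URLs; if internal anchors are wanted instead, the href assignment would become "#" plus the case number, as in the commented-out postprocess_regexps above. Note that tag_to_string() returns plain text, so the href has to be read from the tag itself with tmp_link.get('href').
Code:
def postprocess_html(self, soup, first):
    self.log('===== post process article')
    #loop over all links that point to other decisions (e.g. "t123456eu1.html")
    for tmp_link in soup.findAll('a', href=re.compile("[gtrj][0-9]+....html")):
        tmp_href = tmp_link.get('href')
        self.log('\t\t ====== found link: ' + str(tmp_href))
        #only rewrite relative links; assumption: they should point back to the
        #recent-decisions area on epo.org
        if tmp_href and not tmp_href.startswith('http'):
            tmp_link['href'] = 'http://www.epo.org/law-practice/case-law-appeals/recent/' + tmp_href
    return soup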