MobileRead Forums - View Single Post - Article download fails if I use postprocess_html function

oecherprinte · 11-24-2013, 04:44 PM

By the way: this is my complete recipe (download of recent decisions from European Patent Office). The nz_parse_section function is used in a very unconventional way but works. The problem is in the postprocess_html function.

Code:

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from contextlib import closing
from operator import itemgetter, attrgetter

import operator

import os, time, traceback, re, urlparse, sys, cStringIO, time, datetime, urllib
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
from datetime import date, datetime, time


from calibre import browser, __appname__, iswindows, \
                    strftime, preferred_encoding
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre import entity_to_unicode
from calibre.web import Recipe
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
#from calibre.web.fetch.simple import RecursiveFetcherg
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image

import sys
from PyQt4 import QtGui
from PyQt4 import QtCore

class decisions(BasicNewsRecipe):

    #####################
    # URL to start from
    #####################
    start_url="http://www.epo.org/law-practice/case-law-appeals/recent.html"
 
    __author__  = 'Me myself and I'
    description = 'Recent Decisions of the Boards of Appeal of the EPO'
    timefmt = ''
    language = 'en_GB'
    publisher              = u'European Patent Office'

    max_articles_per_feed = 1000
    #today=datetime.today()
    delay=1
    
    #####################    
    #get date of first and last decision from start_url
    #####################
    date_soup_file=urllib.urlopen(start_url)
    #print date_soup_file
    date_soup = BeautifulSoup(date_soup_file)
    date_list = date_soup.findAll(text=re.compile("Online on"))
    #print ('\t\t--------- search result array length: ' + str(len(date_list)))
    first_date = datetime.strptime((date_list[0].next.string),"%d.%m.%Y")
    last_date = datetime.strptime((date_list[len(date_list)-1].next.string),"%d.%m.%Y")
    #print first_date + ' ' + last_date
   
    #latest_date = date_list[1].next.string
    #log('\t\t--------- latest_date: ' + latest_date)
    
    
    #oldest_date=datetime.strptime("01.11.2013","%d.%m.%Y")

    #####################
    # dialog to get date
    #####################
    
    
    #create gui application for dialog (I think you do not need it it's a leftover)	    
    #app = QtGui.QApplication(sys.argv)

    #get widget
    #widget = QtGui.QWidget()
	    	    
    #open dialog to input integer variables
    #tmp_no_feeds, ok=QtGui.QInputDialog.getInteger(widget, 'Input Dialog - Number of Feeds', 'Number of feeds:',8)

    #this would be the text input dialog
    #tmp_date, ok=QtGui.QInputDialog.getText(widget, 'Input Dialog - Publication date of oldest decision', 'Enter date:',QtGui.QLineEdit.Normal,'01.01.1900')

    #take value if ok was pressed otherwise use default value
    #if ok:
    #  oldest_date=datetime.strptime(str(tmp_date),"%d.%m.%Y")
    #else:
    #  default
    #  oldest_date=datetime.strptime("01.01.1900","%d.%m.%Y")
	    
    #you may now use the variable no_feeds
    #log('\t\toldest date=',oldest_date.strftime("%d.%m.%Y")) 
	     
    ###########################
    # end dialog
    ###########################


    title = "Recent BoA Decisions " + last_date.strftime("%d.%m.%y") + " - " + first_date.strftime("%d.%m.%y")
    print '===================== ' + title + ' ====================='
    
    #BeautifulSoup.MARKUP_MASSAGE=('<!#BeginList>','')

    conversion_options = {
  	'linearize_tables' : True,
}

    #no_stylesheets = True
    
    #insert links to decisions
    
    #replace existing links in header
    #postprocess_regexps = [
    #((re.compile("[gtjr][0-9]+.html")),lambda x: "#" + str(re.match("[gtjr][0-9]+",x)))]
    
    remove_tags_before = dict(name='h1')
    #remove_tags_after  = dict(name='div', attrs={'class':'callToAction'})
    remove_tags = [
       dict(name='style'),
       dict(name='script'),
       #dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),      
       dict(name='div', attrs={'id':['epoHeader','siteMenu']}),
       dict(name='div', attrs={'id':['epoContentLeft','siteMenu']}),
#       dict(name='div', attrs={'id':['rHSBTitle','rHSBSection']}),
       dict(name='div', attrs={'style':['margin-top: 1em ; margin-bottom: 1em ; border: 1px solid black ; padding: .5em']}),
       #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}),
       #dict(name='table', attrs={'cellspacing':'0'}),
    ]

    #my own function to remove erroneous strings before conversion into beautiful soup
    def my_index_to_soup(self, url_or_raw, raw=False):

        if re.match(r'\w+://', url_or_raw):
            open_func = getattr(self.browser, 'open_novisit', self.browser.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw:
                raise RuntimeError('Could not fetch index from %s'%url_or_raw)
        else:
            _raw = url_or_raw
        if raw:
            return _raw
        if not isinstance(_raw, unicode) and self.encoding:
            if callable(self.encoding):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')
        massage = list(BeautifulSoup.MARKUP_MASSAGE)
        enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
        massage.append((re.compile(r'&(\S+?);'), lambda match:
            entity_to_unicode(match, encoding=enc)))

        #remove erroneous strings from input file
        massage.append((re.compile("<!#BeginList>"), lambda match:''))
        massage.append((re.compile("<!#EndList>"), lambda match:''))
        
        return BeautifulSoup(_raw, markupMassage=massage)
    

    #TO GET ARTICLES IN SECTION
    def nz_parse_section(self, url):

        #global oldest_date
        #global start_url
        
        #initialize article lists for single pass
        articles_avm = []
        articles_ele = []
        articles_phy = []
        articles_che = []
        articles_mec = []
        articles_pro = []
        articles_j = []
        articles_g = []
        articles_r = []
        articles_cat = []
        articles_exc = []
        
        #cited decisions
        cited_decisions = [] 
        cited_decisions_feed = []
        
        #initialize feeds
        feeds = []
        cited_decisions = []
        
        #current_articles = []
        filter=url

        #url parameter is misused as a filter, url is hard coded here
        self.log('\t---- starting single pass scan')
        soup = self.index_to_soup(self.start_url)
        #div = soup.find(attrs={'class':'col-300 categoryList'})
        #date = div.find(attrs={'class':'link-list-heading'})

        #go over all tables of class application
        for i_month in soup.findAll('table',attrs={"class" : "application"}):
           
           self.log('\t---- month loop: ' + self.tag_to_string(i_month.previous.previous.previous))
           
           row_count = 0;
     
           #go over all rows within an application table
           for i_row in i_month.next.next.next.findNextSiblings('tr'):
             
             if (len(cited_decisions)>20):
              break;
             
             #self.log('\t\t-------- row loop' + str(row_count))
             
             #extract all information about a decision
             
             #title row - extract case and url
             if (row_count == 0):
                tmp_case_title = self.tag_to_string(i_row) 
                tmp_case_split = tmp_case_title.split(" ");
                tmp_case=tmp_case_split[1] + tmp_case_split[2];
                #self.log('\t\t--------- case: ' + tmp_case)
                
                tmp_url="http://www.epo.org" + i_row.findNext('a').findNext('a').get('href',None)
                #self.log('\t\t--------- url: ' + tmp_url)
                
             #second row: board, date, language, classes, application   
             elif (row_count == 1):
                cells = i_row.findAllNext('td',limit=6)
                
                #board
                tmp_board=self.tag_to_string(cells[1])
                tmp_board=tmp_board[5:len(tmp_board)]
                #self.log('\t\t--------- board: ' + tmp_board)

                #pub_date
                tmp_date=self.tag_to_string(cells[0])
                tmp_date=tmp_date[9:len(tmp_date)]
                #self.log('\t\t--------- date: ' + tmp_date)
                 
                 #language
                tmp_language=self.tag_to_string(cells[3])
                tmp_language=tmp_language[14:len(tmp_language)]
                #self.log('\t\t--------- language: ' + tmp_language)

                #classes - only the first class
                #tmp_classes=self.tag_to_string(cells[4])
                #tmp_classes=tmp_classes[3:min(13,len(tmp_classes))]
                #tmp_classes=tmp_classes.replace(' ','')
                #self.log('\t\t--------- classes: ' + tmp_classes)

             elif (row_count == 2):
             	cells = i_row.findAllNext('td',limit=3)
                
                #keywords
                tmp_keywords=self.tag_to_string(cells[1])
                tmp_keywords=tmp_board[9:len(tmp_keywords)]
                #self.log('\t\t--------- keywords: ' + tmp_keywords)
  
             #elif (row_count == 3):
             	        
             row_count=row_count+1;
             
             #new decision coming up
             if (row_count>3):
               row_count=0;

               #get additional data from decision page
               article_soup=self.index_to_soup(tmp_url)
               
               #get title
               tmp_title=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3TLE"}).get('content',None))
               #self.log('\t\t--------- title: ' + tmp_title)
 
               #Articles used in decision
               tmp_articles=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3ArtRef"}).get('content',None))
               if (tmp_articles != ""):
               	  tmp_articles = "A" + tmp_articles.replace(", "," A")
               #self.log('\t\t--------- articles: ' + tmp_articles)
               
               #Rules used in decision
               tmp_rules=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3RuleRef"}).get('content',None))
               if (tmp_rules != ""):
               	  tmp_rules = "R" + tmp_rules.replace(", "," R")
               #self.log('\t\t--------- rules: ' + tmp_rules)
               
               #articles and rules in tmp_epc
               tmp_epc = tmp_articles
	       if (tmp_rules != ""):
                 tmp_epc = tmp_epc + " " + tmp_rules
               #self.log('\t\t--------- epc: ' + tmp_epc)
               
               #get classes from metadata
               tmp_all_classes=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3CaseIPC"}).get('content',None))
               tmp_all_classes = tmp_all_classes.replace(" ","")
               tmp_classes = (tmp_all_classes.split(","))[0] #get first class for title
               #self.log('\t\t--------- all_classes: ' + tmp_all_classes + ' first class: ' + tmp_classes)
                              
               #check if catchwords are present          
               if (tmp_language =="EN"):
                 tmp_catchword=article_soup.find(text=re.compile("Catchwords"))
               elif (tmp_language == "DE"):  
                 tmp_catchword=article_soup.find(text=re.compile("Orientierungssatz"))
               elif (tmp_language == "FR"):
                 tmp_catchword=article_soup.find(text=re.compile("Exergue"))
                 
               if (tmp_catchword != None):
                tmp_catchword=self.tag_to_string(tmp_catchword.next.next)
                #self.log('\t\t--------- case:' + tmp_case + ' catchword: *' + tmp_catchword + '*')

                if (tmp_catchword != ' - '):
                  is_catchword = True
                else:
         	 is_catchword = False
               else: 
                 is_catchword = False
              
               #check for cited decisions
               tmp_cited_decisions=self.tag_to_string(article_soup.find("meta",attrs={"name" : "dg3aDCI"}).get('content',None))
               #go over all cited decisions
               if (tmp_cited_decisions != ''):
               	   self.log('\t\t no of cited decisions: ' + str(len(tmp_cited_decisions.split(","))))
               	   tmp_cited_decisions = tmp_cited_decisions.replace(' ','')
               	   for cit_dec in tmp_cited_decisions.split(","):
               	      #check if decision already exists in                	      
               	      if not any((cited_decision['id']==cit_dec) for cited_decision in cited_decisions):
               	      	      
               	      	      #determine url
               	      	      tmp_cit_url = cit_dec[1:len(cit_dec)].split("/")
               	      	      tmp_cit_url = tmp_cit_url[1] + tmp_cit_url[0]
               	      	      #self.log('\t\t tmp_cit_url: ' + tmp_cit_url)
               	      	      tmp_cit_url = article_soup.find('a',href=re.compile(tmp_cit_url))
               	      	      
			      if (tmp_cit_url != None):       
			      	 tmp_cit_url = self.tag_to_string(tmp_cit_url.get('href'))
               	      	         tmp_cit_url = "http://www.epo.org/law-practice/case-law-appeals/recent/" + str(tmp_cit_url)
               	      	      
               	      	         #add to list of cited decisions
               	      	         cited_decisions.append({'id': cit_dec,'url': tmp_cit_url, 'num_citings' : 1, 'citings': tmp_case})
               	      	         self.log('\t\t added cited decision: ' + cit_dec + ' [' + str(len(cited_decisions)) + '] ' + tmp_cit_url)
               	      	      
               	      #already exists	      
                      else:
                      	      for tmp_el in cited_decisions:
                      	      	  if (tmp_el['id'] == cit_dec):
                      	      	     tmp_el['num_citings'] += 1
                      	      	     tmp_el['citings'] += ' ' + tmp_case
                                     self.log('\t\t existing citation: ' + cit_dec + ' [' + str(tmp_el['num_citings']) + ']: ' + tmp_el['citings']  )
                                     	
              
               #check if Article 52(2) is mentioned in decision
               tmp_exception=article_soup.find(text=re.compile("52[13,()]+2"))
               if (tmp_exception == None):
               	 is_EXC = False
               else:
               	 is_EXC = True      
              
               #filter decisions
               is_G=(re.search("G",tmp_case))
               is_R=(re.search("R",tmp_case))
               is_J=(re.search("J",tmp_case))
               
               is_TAVM=((re.search("H04N",tmp_all_classes) or re.search("G11B27",tmp_classes)))
               is_TEL=((re.search("3.5.",tmp_board)))
               is_TPH=((re.search("3.4",tmp_board)))
               is_TCH=((re.search("3.3.",tmp_board)))
               is_TME=((re.search("3.2",tmp_board)))
               is_TPROC=(((re.search("A113",tmp_epc)) or (re.search("A116",tmp_epc)) or (re.search("R137",tmp_epc))) and re.search("T",tmp_case))
               is_CATCH=(is_catchword)
                                           
               article_header = tmp_classes + " - " + tmp_case + " - " + "[" + tmp_epc + "]" + " " + tmp_title[:45] + " (" + tmp_date + ")"
               self.log('\t\tFound decision:', article_header)
                 
               #sort article into arrays
               if (is_G):               	       
                 articles_g.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 self.log('\t\t  ----> G(' + str(len(articles_g)) + ')')
                 
               if (is_EXC):               	       
                 articles_exc.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 self.log('\t\t  ----> EXCEPTION(' + str(len(articles_exc)) + ')')
                 
               if (is_J):               	       
                 articles_j.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 self.log('\t\t  ----> J(' + str(len(articles_j)) + ')')
                 
               if (is_TPROC):               	       
                 articles_pro.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 self.log('\t\t  ----> PROC(' + str(len(articles_pro)) + ')')
                 
               if (is_R):               	       
                 articles_r.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 self.log('\t\t  ----> R(' + str(len(articles_r)) + ')')
                 
               if (is_TAVM):               	       
                 articles_avm.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 self.log('\t\t  ----> AVM(' + str(len(articles_avm)) + ')')
                 
               # if (is_TEL):               	       
                 # articles_ele.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 # self.log('\t\t  ----> ELECTRONICS(' + str(len(articles_ele)) + ')')
                 # 
               # if (is_TPH):               	       
                 # articles_phy.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 # self.log('\t\t  ----> PHYSYICS(' + str(len(articles_phy)) + ')')
                 # 
               # if (is_TCH):               	       
                 # articles_che.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 # self.log('\t\t  ----> CHEMISTRY(' + str(len(articles_che)) + ')')
                 # 
               # if (is_TME):               	       
                 # articles_mec.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 # self.log('\t\t  ----> MECHANICS(' + str(len(articles_mec)) + ')')
                 
               if (is_CATCH):               	       
                 articles_cat.append({'title': article_header , 'url':tmp_url, 'description': (tmp_keywords + " (" + tmp_date) + ")", 'date':tmp_date})
                 self.log('\t\t  ----> CATCHWORD(' + str(len(articles_cat)) + ')')
                 
                #self.log('==============================================')
               #input("Press Enter to continue...")   

        #end of loop => add all collected articles
        self.log('============== LOOP END ====================')

        if articles_avm:
          feeds.append(('AVM T Decisions', articles_avm))
          self.log('--> added ' + str(len(articles_avm))  + ' AVM T Decisions');
        
        if articles_ele:
          feeds.append(('Electricity T Decisions', articles_ele))
          self.log('--> added ' + str(len(articles_ele))  + ' Electricity T Decisions');
        
        if articles_phy:
          feeds.append(('Physics T Decisions', articles_phy))
          self.log('--> added ' + str(len(articles_phy))  + ' Physics T Decisions');
        
        if articles_che:
          feeds.append(('Chemistry T Decisions', articles_che))
          self.log('--> added ' + str(len(articles_che))  + ' Chemistry T Decisions');
        
        if articles_mec:
          feeds.append(('Mechanics T Decisions', articles_mec))
          self.log('--> added ' + str(len(articles_mec))  + ' Mechanics T Decisions');
        
        if articles_pro:
          feeds.append(('Procedural T Decisions', articles_pro))
          self.log('--> added ' + str(len(articles_pro))  + ' Procedural T Decisions');
          
        if articles_exc:
          feeds.append(('A52(2) T Decisions', articles_exc))
          self.log('--> added ' + str(len(articles_exc))  + ' A52(2) T Decisions');
        
        if articles_j:
          feeds.append(('J Decisions', articles_j))
          self.log('--> added ' + str(len(articles_j))  + ' J Decisions');
        
        if articles_g:
          feeds.append(('G Decisions', articles_g))
          self.log('--> added ' + str(len(articles_g))  + ' G Decisions');
        
        if articles_r:
          feeds.append(('R Decisions', articles_r))
          self.log('--> added ' + str(len(articles_r))  + ' R Decisions');
        
        if articles_cat:
          feeds.append(('Catchword Decisions', articles_cat))
          self.log('--> added ' + str(len(articles_cat))  + ' Catchword Decisions');       
        
        #sort cited decisions
        sorted_decisions = sorted(cited_decisions,key=lambda k: k['num_citings'],reverse=True)
        
        #add cited decisions
        for tmp_dec in sorted_decisions: 
        	
        	cited_decisions_feed.append({'title': tmp_dec['id'] + ' [' + str(tmp_dec['num_citings']) + ']', 'url': tmp_dec['url'], 'description': tmp_dec['id'], 'date':tmp_date})
        	self.log(str(tmp_dec['num_citings'])  + ' ');
        	
        if cited_decisions_feed:
          feeds.append(('Cited Decisions', cited_decisions_feed))
          self.log('--> added ' + str(len(cited_decisions_feed))  + ' Cited decisions');       

        
        return feeds


    # To GET SECTIONS
    def parse_index(self):
        
            feeds = []
#            for title, url in [
#                ('AVM T Decisions',
#                 'TAVM'),
#                ('Electricity T Decisions',
#                 'TEL'),
#                ('Physics T Decisions',
#                 'TPH'),
#                ('Chemistry T Decisions',
#                 'TCH'),
#                ('Mechanics T Decisions',
#                 'TME'),
#                ('Procedural T Decisions',
#                 'TPROC'),
#                ('J Decisions',
#                 'J'),
#                ('G Decisions',
#                 'G'),
#                ('R Decisions',
#                 'R'), 
#                ('Catchword Decisions',
#            	 'CATCH'),	

 #            ]:
            feeds = self.nz_parse_section(self.start_url)
            return feeds
            
    def postprocess_html(self,soup,first):
    	     		     	     	 
    	 self.log('===== post process article');
         for tmp_link in soup.findAll('a',href=re.compile("[gtrj][0-9]+....html")):
           dummy=0;
           #self.log('\t\t ====== found link: ' + self.tag_to_string(tmp_link).get('href'))
    	   
    	 return soup