View Single Post
Old 06-21-2014, 11:19 AM   #3
entodoays
Zealot
entodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enough
 
entodoays's Avatar
 
Posts: 144
Karma: 706
Join Date: Oct 2011
Device: Sony Reader PRS-T1
Working python script

I wrote a Python script which downloads the pages and places them in separate folders by date. It creates an index file for each day. Then it cleans the pages using BeautifulSoup.

Can anyone help me transform it into a recipe?

Here's the script

Code:
#!/bin/python
# Download the AELF daily-office pages for four consecutive days starting
# with the coming Sunday.  Each day gets its own date-stamped folder with
# the eight office pages, plus an index.html carrying the day's ordo.
# Finally the downloaded pages are stripped of site chrome with
# BeautifulSoup.  NOTE: Python 2 script (urllib.urlretrieve / urlopen).
import datetime, os, urllib, re
from urllib import urlopen
from bs4 import BeautifulSoup

# The eight offices in display order.  File "<n>_<Name>.html" holds the
# office whose URL slug is the lower-cased name ("Messe" -> office-messe).
OFFICES = ["Messe", "Laudes", "Lectures", "Tierce",
           "Sexte", "None", "Vepres", "Complies"]

now = datetime.datetime.now()  # Today's date
os.chdir(os.environ['HOME'])  # Work from the home folder
# All files are stored under a date-stamped base folder.
Base_folder = r'Breviaire_%s-%s-%s' % (now.day, now.month, now.year)
if not os.path.exists(Base_folder):
	os.makedirs(Base_folder)
os.chdir(Base_folder)
idx = (now.weekday() + 1) % 7  # Day of the week, Sunday == 0
Base_date = now + datetime.timedelta(7 - idx)  # The coming Sunday
next_date = Base_date
# Download the files for 4 consecutive days.
for i in range(0, 4):
	next_folder = r'%s-%s-%s' % (next_date.year, next_date.month, next_date.day)
	if not os.path.exists(next_folder):
		os.makedirs(next_folder)
	os.chdir(next_folder)
	site_date = "%s/%s/%s" % (next_date.day, next_date.month, next_date.year)
	# Fetch every office page for this date in one loop instead of eight
	# copy-pasted urlretrieve stanzas; URLs and filenames are unchanged.
	messe_link = None
	for n, office in enumerate(OFFICES):
		office_link = ("http://www.aelf.org/office-%s?desktop=1&date_my=%s"
		               % (office.lower(), site_date))
		urllib.urlretrieve(office_link, filename="%d_%s.html" % (n, office))
		if n == 0:
			messe_link = office_link  # The Mass page also carries the ordo
	html_doc = urlopen(messe_link).read()
	# Extract the ordo block for the day's index page.
	soup = BeautifulSoup(html_doc)
	ordo_text = soup.find("div", {"class": "bloc"})
	if ordo_text is not None:
		# Drop the tooltip bubbles embedded in the ordo (the original
		# crashed here when the "bloc" div was missing).
		for hidden in ordo_text.find_all(id='maBulle'):
			hidden.decompose()
	part1 = """
	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
	<html xmlns="http://www.w3.org/1999/xhtml">
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	</head>
	<body>
	"""
	part3 = """
	<div><a href="0_Messe.html">Messe</a>&nbsp;&nbsp;|&nbsp;&nbsp;
	<a href="1_Laudes.html">Laudes</a>&nbsp;&nbsp;|&nbsp;&nbsp;
	<a href="2_Lectures.html">Lectures</a>&nbsp;&nbsp;|&nbsp;&nbsp;
	<a href="3_Tierce.html">Tierce</a>&nbsp;&nbsp;|&nbsp;&nbsp;
	<a href="4_Sexte.html">Sexte</a>&nbsp;&nbsp;|&nbsp;&nbsp;
	<a href="5_None.html">None</a>&nbsp;&nbsp;|&nbsp;&nbsp;
	<a href="6_Vepres.html">Vepres</a>&nbsp;&nbsp;|&nbsp;&nbsp;
	<a href="7_Complies.html">Complies</a>
	<br><br>
	</div>
	<div style="text-align: center;"><a href="../index.html">Retour</a></div></body>
	</html>
	"""
	joined = "%s<h2>%s</h2>%s%s" % (part1, site_date, ordo_text, part3)
	with open("index.html", "w") as text_file:
		text_file.write(joined)
	# Clean pages: the office files are the ones starting with a digit.
	for filename in os.listdir('.'):
		if re.match(r'\d.*', filename):
			with open(filename, "r") as messy:
				soup = BeautifulSoup(messy)
			for remove in soup.find_all(attrs={'class': ['clr', 'goTop', 'print_only', 'change_country', 'abonnement', 'current', 'bloc', 'degre', 'base']}):
				remove.decompose()
			for remove in soup.find_all(id=['copyright', 'bas', 'menuHorizontal', 'colonneDroite', 'colonneGauche', 'font-resize', 'print_link', 'titre']):
				remove.decompose()
			# Reopen for writing only after parsing is done; the `with`
			# block closes/flushes the handle the original script leaked.
			with open(filename, "w") as output_file:
				output_file.write(str(soup))
	# Go back to the base folder and advance to the next day.
	# BUG FIX: the original used timedelta(days=i), which left next_date
	# unchanged after the first iteration, downloading the first Sunday
	# twice and covering only 3 distinct days instead of 4.
	os.chdir("..")
	next_date = Base_date + datetime.timedelta(days=i + 1)
entodoays is offline   Reply With Quote