I wrote a Python script that downloads the pages, places them in separate folders by date, creates an index file for each day, and then cleans the pages with BeautifulSoup.
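For reference, one run produces a layout like this (the dates here are purely illustrative: the top folder is stamped day-month-year, the per-day folders year-month-day):
Code:
Breviaire_15-5-2013/
    2013-5-19/
        index.html
        0_Messe.html
        1_Laudes.html
        ...
        7_Complies.html
    2013-5-20/
        ...
    2013-5-21/
        ...
    2013-5-22/
        ...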
Can anyone help me transform it into a calibre recipe? To show where I'm stuck, I've put a rough sketch of what I imagine the recipe might look like after the script.
Here's the script:
Code:
#!/usr/bin/env python
import datetime, os, urllib, re
from urllib import urlopen
from bs4 import BeautifulSoup
now = datetime.datetime.now() #Get today's date
os.chdir(os.environ['HOME']) #Go to home folder
Base_folder = r'Breviaire_%s-%s-%s' % (now.day, now.month, now.year) #All files will be stored in this date-stamped folder
if not os.path.exists(Base_folder): os.makedirs(Base_folder) #Create a folder with today's date
os.chdir(Base_folder) #Go to the freshly created folder
idx = (now.weekday() + 1) % 7 #Days elapsed since last Sunday (Sunday itself = 0)
Base_date = now + datetime.timedelta(7-idx) #Get next Sunday's date (a full week ahead if today is Sunday)
next_date = Base_date
#Download the files for the next 4 days, starting from that Sunday
offices = ['Messe', 'Laudes', 'Lectures', 'Tierce', 'Sexte', 'None', 'Vepres', 'Complies']
url_tmpl = "http://www.aelf.org/office-%s?desktop=1&date_my=%s"
for i in range(0, 4):
    next_folder = r'%s-%s-%s' % (next_date.year, next_date.month, next_date.day)
    if not os.path.exists(next_folder): os.makedirs(next_folder)
    os.chdir(next_folder)
    site_date = "%s/%s/%s" % (next_date.day, next_date.month, next_date.year)
    #Fetch the Mass and every hour of the office; the numeric prefix keeps the files ordered
    for n, office in enumerate(offices):
        urllib.urlretrieve(url_tmpl % (office.lower(), site_date), filename="%s_%s.html" % (n, office))
    #Extract the ordo from the Mass page for the day's index
    html_doc = urlopen(url_tmpl % ('messe', site_date)).read()
    soup = BeautifulSoup(html_doc, "html.parser")
    ordo_text = soup.find("div", {"class": "bloc"})
    for hidden in ordo_text.find_all(id='maBulle'):
        hidden.decompose() #Drop the pop-up bubbles embedded in the ordo
    text_file = open("index.html", "w")
    part1 = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
"""
part3 = """
<div><a href="0_Messe.html">Messe</a> |
<a href="1_Laudes.html">Laudes</a> |
<a href="2_Lectures.html">Lectures</a> |
<a href="3_Tierce.html">Tierce</a> |
<a href="4_Sexte.html">Sexte</a> |
<a href="5_None.html">None</a> |
<a href="6_Vepres.html">Vepres</a> |
<a href="7_Complies.html">Complies</a>
<br><br>
</div>
<div style="text-align: center;"><a href="../index.html">Retour</a></div></body>
</html>
"""
    joined = "%s<h2>%s</h2>%s%s" % (part1, site_date, ordo_text, part3)
    text_file.write(joined)
    text_file.close()
    #Clean the downloaded pages of navigation and boilerplate
    for filename in os.listdir('.'):
        if re.match(r'\d.*', filename):
            messy = open(filename, "r")
            soup = BeautifulSoup(messy, "html.parser")
            messy.close()
            for remove in soup.find_all(attrs={'class': ['clr', 'goTop', 'print_only', 'change_country', 'abonnement', 'current', 'bloc', 'degre', 'base']}):
                remove.decompose()
            for remove in soup.find_all(id=['copyright', 'bas', 'menuHorizontal', 'colonneDroite', 'colonneGauche', 'font-resize', 'print_link', 'titre']):
                remove.decompose()
            cleaned = str(soup)
            output_file = open(filename, "w")
            output_file.write(cleaned)
            output_file.close()
    # Go back to the parent folder and move on to the following day
    os.chdir("..")
    next_date = Base_date + datetime.timedelta(days=i + 1)
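And here is the rough sketch I mentioned, going by the parse_index() examples in the calibre manual. It is untested and almost certainly wrong in places: the class name and attributes are my guesses, and the remove_tags entries are just the same classes and ids the script strips with BeautifulSoup.
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import datetime

class Breviaire(BasicNewsRecipe):
    title = 'Breviaire AELF'
    language = 'fr'
    no_stylesheets = True
    # Same elements the script removes with BeautifulSoup (my guess at the syntax)
    remove_tags = [
        dict(attrs={'class': ['clr', 'goTop', 'print_only', 'change_country',
                              'abonnement', 'current', 'bloc', 'degre', 'base']}),
        dict(attrs={'id': ['copyright', 'bas', 'menuHorizontal', 'colonneDroite',
                           'colonneGauche', 'font-resize', 'print_link', 'titre']}),
    ]

    def parse_index(self):
        offices = ['Messe', 'Laudes', 'Lectures', 'Tierce', 'Sexte',
                   'None', 'Vepres', 'Complies']
        now = datetime.datetime.now()
        idx = (now.weekday() + 1) % 7
        base_date = now + datetime.timedelta(7 - idx)  # next Sunday, as in the script
        feeds = []
        for i in range(4):
            day = base_date + datetime.timedelta(days=i)
            site_date = "%s/%s/%s" % (day.day, day.month, day.year)
            articles = []
            for office in offices:
                url = ("http://www.aelf.org/office-%s?desktop=1&date_my=%s"
                       % (office.lower(), site_date))
                articles.append({'title': office, 'url': url,
                                 'date': site_date, 'description': ''})
            feeds.append((site_date, articles))
        return feeds
If I read the manual right, calibre builds its own index from whatever parse_index() returns, so the hand-made index.html pages would probably go away; it's mainly the ordo extraction and the date-stamped output that I don't know how to carry over.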