Thanks, I'll check it out.
Meanwhile, this seems to work when using Amazon. I just need to ignore suggested books that sometimes show up on a book's details page:
Code:
import re
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
           'Accept-Language': 'en-US, en;q=0.5'}

url = 'https://www.amazon.com/s?i=stripbooks&rh=p_66%3A123'

# strip the tracking part ("/ref=...") off a result URL
pattern_url = re.compile("^(.+?)/ref=")
# suggested stuff
pattern_url_bad = re.compile("^/sspa/click")
# title: ignore stuff after dash
pattern_title = re.compile("^(.+?) - ")

reqs = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(reqs.text, 'lxml')
for div in soup("div", attrs={"class": 'puisg-col-inner'}):
    results = div.find("a", attrs={"class": 'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})
    if results:
        # sponsored stuff? ignore
        m = pattern_url_bad.search(results["href"])
        if m:
            continue
        m = pattern_url.search(results["href"])
        if m:
            # href is site-relative, so it already starts with "/"
            url = f"https://www.amazon.com{m.group(1)}"
            print(url)
            # TODO Also returns sponsored stuff in book's page sometimes?
            reqs = requests.get(url, headers=HEADERS)
            book_soup = BeautifulSoup(reqs.text, 'lxml')
            title = book_soup.head.title.string
            m = pattern_title.search(title)
            if m:
                print("Title:", m.group(1))
            author_div = book_soup.find("div", attrs={"class": 'a-column a-span4 _follow-the-author-card_style_authorNameColumn__1YFry'})
            if author_div:
                print("Author:", author_div.text.strip())
            print("==============")
---
Edit: It can be done with just one request per ISBN, by parsing the search results directly and ignoring the extra, sponsored hits:
Code:
import re
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep
from datetime import datetime

DIV_ATTRBS = {"class": 'a-section a-spacing-none puis-padding-right-small s-title-instructions-style',
              "data-cy": "title-recipe"}
KEY_AUTHOR = "a-size-base a-link-normal s-underline-text s-underline-link-text s-link-style"
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
           'Accept-Language': 'en-US, en;q=0.5'}
INPUT = "test.isbn.txt"
OUTPUT = "test.isbn.out.txt"

# strip the tracking part ("/ref...") off a result URL
pattern_url = re.compile("^(.+?)/ref")

with open(INPUT) as reader:
    for isbn in reader:
        isbn = isbn.rstrip('\n')
        url = f"https://www.amazon.com/s?i=stripbooks&rh=p_66%3A{isbn}"
        print(url)
        try:
            reqs = requests.get(url, headers=HEADERS)
        except requests.RequestException:
            error = f"{datetime.now()} Failed downloading {url}"
            print(error)
            with open("error.txt", "a", encoding="utf-8") as text_file:
                text_file.write(f"Error {datetime.now()}: {isbn}\n")
            exit()
        soup = BeautifulSoup(reqs.text, 'lxml')
        for div in soup("div", attrs=DIV_ATTRBS):
            # ignore sponsored stuff
            if div.find("span", string="Sponsored"):
                continue
            # URL to book; reset so a previous hit can't leak into the output
            url = ""
            m = pattern_url.search(div.a["href"])
            if m:
                url = f"https://www.amazon.com{m.group(1)}"
                print("URL=", url)
            # title
            if div.h2:
                title = div.h2.text
                print("Title:", title)
            else:
                title = ""
                print("No title")
            # author
            author_link = div.find("a", class_=KEY_AUTHOR)
            if author_link:
                author = author_link.text
                print("Author:", author)
            else:
                author = ""
                print("No author")
            output = f"{isbn}\t{title}\t{author}\t{url}\n"
            print(output)
            with open(OUTPUT, "a", encoding="utf-8") as text_file:
                text_file.write(output)
        # will we get blocked at some point?
        # sleep(randint(1, 10))
        print("Sleeping 15 min at", datetime.now())
        sleep(900)  # 15 min seems enough to avoid throttling
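On the throttling question in the comments: instead of a fixed 15-minute sleep, a randomized exponential backoff on failed fetches might keep the total runtime down while still spacing requests out. Just a sketch; polite_get, max_tries, and the delay bounds are my own guesses, not anything tested against Amazon:
Code:
import random
import time

import requests

def polite_get(session, url, headers, max_tries=4, base_delay=60):
    """Fetch url, retrying with exponential backoff plus jitter on failure."""
    for attempt in range(max_tries):
        try:
            resp = session.get(url, headers=headers, timeout=30)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        if attempt < max_tries - 1:
            # back off base_delay * 2**attempt seconds, randomized by +/-50%
            time.sleep(base_delay * (2 ** attempt) * random.uniform(0.5, 1.5))
    return None

# usage: resp = polite_get(requests.Session(), url, HEADERS)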