Thanks, I'll check it out.
Meanwhile, this seems to work when using Amazon. I just need to ignore suggested books that sometimes show up in a book's details:
Code:
import re
import requests
from bs4 import BeautifulSoup
# Browser-like headers sent with every request made by this script.
HEADERS = ({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

# Scheme + host that relative product links are resolved against.
BASE_URL = 'https://www.amazon.com'

# Seconds to wait for a response; without a timeout requests.get() can block forever.
REQUEST_TIMEOUT = 30

# Search-results page to scrape (host fixed: the original read "iww.amazon.com").
url = 'https://www.amazon.com/s?i=stripbooks&rh=p_66%3A123'

# Keeps the part of a link that precedes the "/ref=" tracking suffix.
pattern_url = re.compile(r"^(.+?)/ref=")
# Links starting with /sspa/click are the suggested/sponsored entries to skip.
pattern_url_bad = re.compile(r"^/sspa/click")
# Title: ignore everything after the first " - ".
pattern_title = re.compile(r"^(.+?) - ")


def product_url(href):
    """Return the absolute product URL for a search-result link, or None.

    None is returned when *href* is missing/empty, when it is a
    suggested/sponsored link (matches ``pattern_url_bad``), or when it has no
    "/ref=" suffix to strip (does not match ``pattern_url``).
    """
    if not href or pattern_url_bad.search(href):
        return None
    found = pattern_url.search(href)
    if found is None:
        return None
    path = found.group(1)
    if path.startswith(("http://", "https://")):
        # Already absolute: prefixing BASE_URL would corrupt it.
        return path
    # hrefs normally start with "/", so strip it to avoid "amazon.com//...".
    return f"{BASE_URL}/{path.lstrip('/')}"


def clean_title(raw_title):
    """Return the part of *raw_title* before the first " - ", or None.

    None is returned when *raw_title* is None or contains no " - " separator.
    """
    if raw_title is None:
        return None
    found = pattern_title.search(raw_title)
    return found.group(1) if found else None


def fetch_soup(page_url):
    """Download *page_url* with HEADERS and return it parsed with lxml."""
    response = requests.get(page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'lxml')


def main():
    """Print URL, title and author for each book found on the listing page."""
    listing = fetch_soup(url)
    for cell in listing.find_all("div", attrs={"class": 'puisg-col-inner'}):
        link = cell.find("a", attrs={"class": 'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})
        if link is None:
            continue

        # Skips sponsored links and links that are not in the expected shape.
        book_url = product_url(link.get("href"))
        if book_url is None:
            continue
        print(book_url)

        # TODO: the book's page sometimes also contains sponsored/suggested items?
        page = fetch_soup(book_url)

        # Any of head / title / string can be absent, so walk down defensively.
        title_tag = page.head.title if page.head is not None else None
        title = clean_title(title_tag.string if title_tag is not None else None)
        if title is not None:
            print("Title:", title)

        author_box = page.find("div", attrs={"class": 'a-column a-span4 _follow-the-author-card_style_authorNameColumn__1YFry'})
        if author_box is not None:
            print("Author:", author_box.text.strip())
        print("==============")


if __name__ == "__main__":
    main()