View Single Post
Old 08-29-2025, 02:53 PM   #9
Shohreh
Addict
Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.Shohreh ought to be getting tired of karma fortunes by now.
 
Posts: 221
Karma: 304158
Join Date: Jan 2016
Location: France
Device: none
Thanks, I'll check it out.

Meanwhile, the following seems to work with Amazon. I just need to ignore the suggested/sponsored books that sometimes show up on a book's detail page:

Code:
import re

import requests
from bs4 import BeautifulSoup

# Pretend to be a desktop browser: Amazon rejects the default
# python-requests User-Agent.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

# Book search restricted to the ISBN refinement (rh=p_66%3A<isbn>).
# NOTE: original read 'iww.amazon.com', an apparent typo for 'www'.
SEARCH_URL = 'https://www.amazon.com/s?i=stripbooks&rh=p_66%3A123'

# A result href looks like '/Some-Title/dp/ASIN/ref=...'; keep the part
# before the '/ref=' tracking suffix.
pattern_url = re.compile(r"^(.+?)/ref=")
# Sponsored/suggested results link through an '/sspa/click' redirect.
pattern_url_bad = re.compile(r"^/sspa/click")
# Page <title>: keep only the part before ' - ' (subtitle/edition follows).
pattern_title = re.compile(r"^(.+?) - ")

search_page = requests.get(SEARCH_URL, headers=HEADERS)
search_soup = BeautifulSoup(search_page.text, 'lxml')

for result in search_soup("div", attrs={"class": 'puisg-col-inner'}):
    link = result.find("a", attrs={"class": 'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})
    if not link:
        continue

    # Skip sponsored/suggested entries.
    if pattern_url_bad.search(link["href"]):
        continue

    m = pattern_url.search(link["href"])
    if not m:
        continue

    # The captured path already starts with '/', so don't insert another
    # slash (the original produced 'https://www.amazon.com//Title/dp/...').
    book_url = f"https://www.amazon.com{m.group(1)}"
    print(book_url)

    # TODO: the book page itself sometimes contains sponsored suggestions too.
    book_page = requests.get(book_url, headers=HEADERS)
    book_soup = BeautifulSoup(book_page.text, 'lxml')

    title = book_soup.head.title.string
    m = pattern_title.search(title)
    if m:
        print("Title:", m.group(1))

    author_div = book_soup.find("div", attrs={"class": 'a-column a-span4 _follow-the-author-card_style_authorNameColumn__1YFry'})
    if author_div:
        print("Author:", author_div.text.strip())
    print("==============")
---
Edit: It can be done with a single request per ISBN, by parsing the search-results page directly and skipping the extra sponsored hits.

Code:
import re  # original omitted this import, so pattern_url raised NameError
from datetime import datetime
from pathlib import Path
from random import randint
from time import sleep

import requests
from bs4 import BeautifulSoup

# A search-result tile: <div data-cy="title-recipe" class="a-section ...">.
DIV_ATTRBS = {
    "class": 'a-section a-spacing-none puis-padding-right-small s-title-instructions-style',
    "data-cy": "title-recipe",
}
# Class string of the author link inside a result tile.
KEY_AUTHOR = "a-size-base a-link-normal s-underline-text s-underline-link-text s-link-style"
# Pretend to be a desktop browser: Amazon rejects the default requests UA.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}
INPUT = "test.isbn.txt"      # one ISBN per line
OUTPUT = "test.isbn.out.txt"  # tab-separated: isbn, title, author, url

# A result href looks like '/Some-Title/dp/ASIN/ref=...'; keep the part
# before the '/ref' tracking suffix.
pattern_url = re.compile(r"^(.+?)/ref")

with open(INPUT) as reader:
    for isbn in reader:
        isbn = isbn.rstrip('\n')
        search_url = f"https://www.amazon.com/s?i=stripbooks&rh=p_66%3A{isbn}"
        print(search_url)

        try:
            reqs = requests.get(search_url, headers=HEADERS)
        except requests.RequestException:  # was a bare except: — too broad
            error = f"{datetime.now()} Failed downloading {search_url}"
            print(error)
            with open("error.txt", "a", encoding="utf-8") as text_file:
                # original omitted '\n', so successive errors ran together
                text_file.write(f"Error {datetime.now()}: {isbn}\n")
            raise SystemExit(1)

        soup = BeautifulSoup(reqs.text, 'lxml')

        for div in soup("div", attrs=DIV_ATTRBS):
            # Ignore sponsored results.
            if div.find("span", string="Sponsored"):
                continue

            # Reset per-tile so a missing field can't leak the previous
            # row's value (original never assigned `title` at all, which
            # raised NameError when building the output line).
            url = ""
            title = ""
            author = ""

            # URL to the book
            m = pattern_url.search(div.a["href"])
            if m:
                url = f"https://www.amazon.com{m.group(1)}"
                print("URL=", url)

            # Title
            if div.h2:
                title = div.h2.text
                print("Title:", title)
            else:
                print("No title")

            # Author
            author_link = div.find("a", class_=KEY_AUTHOR)
            if author_link:
                author = author_link.text
                print("Author:", author)
            else:
                print("No author")

            output = f"{isbn}\t{title}\t{author}\t{url}\n"
            print(output)
            with open(OUTPUT, "a", encoding="utf-8") as text_file:
                text_file.write(output)

        # Throttle between ISBNs to avoid getting blocked.
        # sleep(randint(1, 10))
        print("Sleeping 15mn at ", datetime.now())
        sleep(900)  # 15 minutes seems enough to avoid throttling

Last edited by Shohreh; 09-03-2025 at 08:15 AM.
Shohreh is offline   Reply With Quote