from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import urllib3
import gspread
# Google Sheets client: authenticate via a service-account key file and open
# the worksheet holding product URLs (col 1) and the result columns (5-8).
sa = gspread.service_account(filename="C:/Users/user/Documents/sheetsapi.json")
sh = sa.open("only74kotc")
sheet = sh.worksheet("only74kotc")
# Browser-like request headers so the target site serves the normal HTML page
# instead of blocking the scraper.
headers = {
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
}
# NOTE(review): this PoolManager is never used below — all HTTP goes through
# `requests`. Confirm nothing else needs it before removing.
http = urllib3.PoolManager()
# Column layout in the sheet (1-based): 1=product URL, 5=description word
# count, 6=benefit word count, 7=total words, 8=extracted description text.
URL_COL, DESC_LEN_COL, BENEFIT_LEN_COL, TOTAL_COL, DESC_COL = 1, 5, 6, 7, 8

# Section headings that delimit blocks inside the product-description div.
SECTION_HEADINGS = ["Key Ingredients:", "Key Benefits:",
                    "Directions For Use:", "Safety Information:"]
DESCRIPTION_CLASS = "ProductDescription__description-content___A_qCZ"


def _fetch_soup(url):
    """GET *url* and return the parsed HTML.

    On a network error, wait one minute and retry once; a second failure
    propagates to the caller (the per-row handler logs it and moves on).
    """
    try:
        response = requests.get(url, allow_redirects=False,
                                headers=headers, timeout=30)
    except requests.RequestException:
        print("waiting one min")
        time.sleep(60)
        response = requests.get(url, allow_redirects=False,
                                headers=headers, timeout=30)
    return BeautifulSoup(response.text, 'html.parser')


def _extract_description(maindiv, tag):
    """Extract the leading description block from *maindiv*.

    The block is the content between the first two *tag* ('strong' or 'b')
    headings, where the second heading must be one of SECTION_HEADINGS.
    Returns "<first heading text> <body>" with <br> markup stripped, or
    None when the expected heading structure is absent.
    """
    first_heading = maindiv.find(tag)
    if first_heading is None:
        return None
    second_heading = first_heading.find_next(tag)
    if second_heading is None or second_heading.text not in SECTION_HEADINGS:
        return None
    start = maindiv.contents.index(first_heading)
    end = maindiv.contents.index(second_heading)
    between = maindiv.contents[start + 1:end]
    # Plain-text blocks (no lists): collapse to the stripped node texts
    # before serializing, dropping bare whitespace nodes.
    if not any(node.name in ('ul', 'li') for node in between):
        between = ''.join(node.text.strip()
                          for node in between if node not in (' ', '\n'))
    body = ''.join(str(node) for node in between)
    return first_heading.text + " " + body.replace("<br>", "").replace("<br/>", "")


def _extract_key_benefits(soup, maindiv, tag):
    """Collect the <li> texts of every <ul> between the 'Key Benefits:'
    heading and the next *tag* heading.

    Returns a (possibly empty) list of benefit strings, or None when no
    'Key Benefits:' heading exists in *maindiv*.
    """
    heading = maindiv.find(tag, string="Key Benefits:")
    if heading is None:
        return None
    # Heading that terminates the benefits section; may legitimately be
    # absent on the last section of the page.
    next_heading = heading.find_next_sibling(tag)
    stop_text = next_heading.text if next_heading is not None else None
    page_heading = soup.find(tag, string="Key Benefits:")
    if page_heading is None:
        return None
    benefits = []
    for ul in page_heading.find_next_siblings("ul"):
        # Stop once we have passed the following section heading.
        if stop_text and ul.find_previous_sibling(tag, string=stop_text):
            break
        benefits.extend(li.text.strip() for li in ul.find_all("li"))
    return benefits


start_row = 1
for row in range(start_row, 324):
    try:
        url = sheet.cell(row, URL_COL).value
        soup = _fetch_soup(url)
        maindiv = soup.find('div', class_=DESCRIPTION_CLASS)
        if maindiv is None:
            print(f"Error at row {row}: description div not found")
            continue
        # The site marks headings with either <strong> or <b>; try both.
        description = None
        for tag in ('strong', 'b'):
            description = _extract_description(maindiv, tag)
            if description is not None:
                break
        if description is None:
            print("strong / b Not found")
        else:
            print(description)
            description_words = len(description.split())
            benefits = _extract_key_benefits(soup, maindiv, tag)
            if benefits is None:
                print("Key Benefits not found")
                benefit_words = 0
            else:
                benefit_words = sum(len(b.split()) for b in benefits)
                print("Key Benefits:", benefits)
                print("Description Count:", description_words)
                print("Benefit Count:", benefit_words)
            # Write results only when extraction succeeded, so a failed row
            # can never inherit stale values left over from a previous one.
            sheet.update_cell(row, DESC_LEN_COL, description_words)
            sheet.update_cell(row, BENEFIT_LEN_COL, benefit_words)
            sheet.update_cell(row, TOTAL_COL, description_words + benefit_words)
            sheet.update_cell(row, DESC_COL, description)
        print("done")
        time.sleep(1)  # be polite to the server between rows
    except Exception as e:
        # Any per-row failure is logged and the scrape continues.
        print(f"Error at row {row}: {e}")