from bs4 import BeautifulSoup
import requests
import pandas as pd
from bs4 import Tag
import time
# Browser-like request headers sent with the product-page request.
# NOTE(review): "sdch" is advertised in Accept-Encoding; confirm the HTTP
# client can decode it, otherwise consider dropping it from the list.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}
url = "https://www.1mg.com/otc/small-wonder-admire-baby-feeding-bottle-small-green-otc499746?wpsrc=Google+Organic+Search"

# Redirects are deliberately not followed (as in the original request).
# A timeout is supplied so an unresponsive server cannot hang the script.
source = requests.get(url, allow_redirects=False, headers=headers, timeout=30)
soup = BeautifulSoup(source.text, 'html.parser')

# Product title and the container holding the product description markup.
title = soup.find('h1', class_="ProductTitle__product-title___3QMYH")
maindiv = soup.find('div', class_="ProductDescription__description-content___A_qCZ")

# Section headings are marked up with either <strong> or <b>.  Prefer
# <strong>; fall back to <b> when no <strong> exists.  `maindiv` may be None
# (class name changed, redirect page, blocked request), in which case
# `first_strong` stays None and the sections below are skipped instead of
# crashing with AttributeError here.
first_strong = maindiv.find('strong') if maindiv is not None else None
if first_strong is not None:
    tag_used = 'strong'
else:
    tag_used = 'b'
    first_strong = maindiv.find('b') if maindiv is not None else None
# ------------------------------------ USES
# Prints the text of the "Uses" section: the nodes that lie between the
# first bold heading and the second bold heading after it, provided that
# second heading is one of the known section titles below.
# NOTE(review): the pasted source had lost its indentation; the nesting here
# is reconstructed from the original inline comments -- confirm.
_USES_FOLLOWING_HEADERS = (
    "Key Ingredients:", "Key Benefits:", "Directions For Use:", "Safety Information:", "Product Specifications and Features:",
    "Key Ingredients", "Key Ingredient:", "Key Benefits", "Directions For Use", "Safety Information", "Product Specifications and Features",
    "Brief Details:", "Brief Details", "Basic Details:", "Basic Details",
)

if first_strong:
    # Walk two bold tags forward; either lookup may return None, so the
    # calls are not chained.
    uses_heading = first_strong.find_next(tag_used)
    second_strong = uses_heading.find_next(tag_used) if uses_heading else None
    if second_strong and second_strong.text in _USES_FOLLOWING_HEADERS:
        try:
            # Tag.index() locates a child by identity.  list.index() on
            # .contents compares by equality and can return an earlier node
            # that merely looks the same.
            first_strong_index = maindiv.index(first_strong)
            second_strong_index = maindiv.index(second_strong)
        except ValueError:
            # One of the headings is not a direct child of the description
            # container, so there is no slice to take.
            result_html = []
        else:
            # +2 skips the first heading and the node that follows it.
            result_html = maindiv.contents[first_strong_index + 2:second_strong_index]

        # Only proceed when a "Uses" heading is found after the <b>/<strong> tag.
        if any(getattr(node, "name", None) == tag_used and "Uses" in node.text for node in result_html):
            # Drop the "Uses" heading itself, keep its content.
            result_html = [
                item for item in result_html
                if not (isinstance(item, Tag) and 'Uses' in item.text)
            ]
            contains_list = any(getattr(node, "name", None) in ('ul', 'li') for node in result_html)
            if contains_list:
                # With <ul>/<li> present the text-only path does not apply;
                # keep the raw markup of every node instead.
                result = "".join(str(node) for node in result_html)
            else:
                # Plain text only: strip each node and skip bare whitespace.
                result = "".join(
                    node.text.strip() for node in result_html
                    if node != ' ' and node != '\n'
                )
            finalresult = result.replace("<br>", "").replace("<br/>", "")
            # finalresult is always a str here, so the word count cannot fail.
            descriptionlen = len(finalresult.split())
            print(finalresult)
            print("-------------------")
# ------------------------------- DESCRIPTION ---------
# Prints the first bold heading followed by everything between it and the
# next bold heading, provided that next heading is one of the known section
# titles below.
# NOTE(review): the pasted source had lost its indentation; the nesting here
# is reconstructed from the statement order -- confirm.
_DESCRIPTION_FOLLOWING_HEADERS = (
    "Key Ingredients:", "Key Benefits:", "Directions For Use:", "Safety Information:", "Uses:", "Product Specifications and Features:",
    "Key Ingredients", "Key Ingredient:", "Key Benefits", "Directions For Use", "Safety Information", "Uses", "Product Specifications and Features",
    "Brief Details:", "Brief Details", "Basic Details:", "Basic Details",
    "Product Specification and Features", "Product Specification and Features:",
)

if first_strong:
    second_strong = first_strong.find_next(tag_used)
    if second_strong and second_strong.text in _DESCRIPTION_FOLLOWING_HEADERS:
        try:
            # Tag.index() locates a child by identity.  list.index() on
            # .contents compares by equality and can return an earlier node
            # that merely looks the same.
            first_strong_index = maindiv.index(first_strong)
            second_strong_index = maindiv.index(second_strong)
        except ValueError:
            # A heading is nested deeper than the description container, so
            # the section cannot be sliced out; skip it rather than crash.
            pass
        else:
            result_html = maindiv.contents[first_strong_index + 1:second_strong_index]
            contains_list = any(getattr(node, "name", None) in ('ul', 'li') for node in result_html)
            if contains_list:
                # Keep the raw markup when the section contains a list.
                result = "".join(str(node) for node in result_html)
            else:
                # Plain text only: strip each node and skip bare whitespace.
                result = "".join(
                    node.text.strip() for node in result_html
                    if node != ' ' and node != '\n'
                )
            finalresult = first_strong.text + " " + result.replace("<br>", "").replace("<br/>", "")
            # finalresult is always a str here, so the word count cannot fail.
            descriptionlen = len(finalresult.split())
            print(finalresult)
            print("-------------------")
# ---------------------------------------------------------------------------
# PRODUCT SPECIFICATIONS AND FEATURES
# Finds the specifications heading (with or without a trailing colon) inside
# the description container, collects the <li> items of the <ul> lists that
# follow it, and prints the items together with their total word count.
# NOTE(review): the pasted source had lost its indentation; the nesting here
# is reconstructed from the statement order -- confirm.
BOLD_TAGS = [
    "Product Specification and Features",
    "Product Specifications and Features",
    "Product Specifications & Features",
    "Product Specification & Feature",
]
benefits = None
benefits_header = None
# Holds the matched label; stays on the last candidate when nothing matches.
BOLD_TAG = BOLD_TAGS[-1]
try:
    if maindiv is not None:
        for candidate in BOLD_TAGS:
            for label in (candidate, candidate + ":"):
                header = maindiv.find(tag_used, text=label)
                # A heading only counts when a <ul> follows it as a sibling.
                if header is not None and header.find_next_sibling("ul") is not None:
                    benefits_header = header
                    BOLD_TAG = label
                    break
            if benefits_header is not None:
                break

    if benefits_header is None:
        # Report a missing section as "not found" rather than as an error.
        print(BOLD_TAG, " not found")
        print("-------------------")
    else:
        benefits = []
        # Walk forward from the heading only.  Looking at *previous*
        # siblings of each <ul> would also see nodes located before the
        # heading and could stop the collection immediately.
        for sibling in benefits_header.next_siblings:
            if not isinstance(sibling, Tag):
                continue  # text between tags
            if sibling.name == tag_used or sibling.name == "div":
                # The next section heading or a <div> ends this section.
                break
            if sibling.name == "ul":
                benefits.extend(li.text.replace('\xa0', ' ') for li in sibling.find_all("li"))
        word_count = sum(len(benefit.split()) for benefit in benefits)
        print(BOLD_TAG, benefits)
        print(BOLD_TAG, "Count:-", word_count)
        print("-------------------")
except Exception:
    # Best-effort section: report the failure and let the script continue.
    print("Error in ", BOLD_TAG)
    print("-------------------")
# Blog page footer captured along with the script (not part of the code):
# "0 comments:" / "Post a Comment"