OTC scraper (new)

from bs4 import BeautifulSoup
import requests
import pandas as pd
from bs4 import Tag
import time

# Browser-like request headers sent with the product-page request.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}


# Product page to scrape.
url = "https://www.1mg.com/otc/small-wonder-admire-baby-feeding-bottle-small-green-otc499746?wpsrc=Google+Organic+Search"

# Redirects are not followed, so a 3xx response body is parsed as-is.
# The timeout keeps the script from hanging forever on an unresponsive server.
source = requests.get(url, allow_redirects=False, headers=headers, timeout=30)
soup = BeautifulSoup(source.text, 'html.parser')
title = soup.find('h1', class_="ProductTitle__product-title___3QMYH")
maindiv = soup.find('div', class_="ProductDescription__description-content___A_qCZ")

if maindiv is None:
    # The description container is missing (markup changed, redirect page, ...).
    # Leave first_strong/tag_used empty so every `if first_strong:` section
    # further down is skipped instead of crashing on `None.find(...)`.
    first_strong = None
    tag_used = None
    print("Product description container not found")
else:
    # Section headings are marked up with <strong> on some pages and <b> on
    # others; remember which one this page uses for all later lookups.
    first_strong = maindiv.find('strong')
    if first_strong is None:
        first_strong = maindiv.find('b')
        tag_used = 'b'
    else:
        tag_used = 'strong'



#------------------------------------USES
# Prints the "Uses" text found between the first bold heading of the
# description and the bold heading two positions after it, provided that
# later heading is one of the known section titles below.
USES_NEXT_HEADINGS = (
    "Key Ingredients:", "Key Benefits:", "Directions For Use:", "Safety Information:", "Product Specifications and Features:",
    "Key Ingredients", "Key Ingredient:", "Key Benefits", "Directions For Use", "Safety Information", "Product Specifications and Features",
    "Brief Details:", "Brief Details", "Basic Details:", "Basic Details",
)

if first_strong:
    # Hop two bold tags forward. Either hop yields None when the document
    # has no further matching tag, so each step is checked before chaining.
    following_strong = first_strong.find_next(tag_used)
    second_strong = following_strong.find_next(tag_used) if following_strong else None

    if second_strong and second_strong.text in USES_NEXT_HEADINGS:
        try:
            first_strong_index = maindiv.contents.index(first_strong)
            second_strong_index = maindiv.contents.index(second_strong)
        except ValueError:
            # One of the headings is not a direct child of maindiv, so the
            # region between them cannot be sliced out of maindiv.contents.
            result_html = []
        else:
            # +2 skips the first heading and the single node right after it
            # (presumably the description text -- confirm against page markup).
            result_html = maindiv.contents[first_strong_index + 2:second_strong_index]

        # This only works when "Uses" appears inside a <b>/<strong> tag within the slice.
        if any(tag.name == tag_used and "Uses" in tag.text for tag in result_html):
            # Drop every tag whose text mentions "Uses" (this removes the heading itself).
            result_html = [item for item in result_html if not (isinstance(item, Tag) and 'Uses' in item.text)]
            # Results containing <ul>/<li> markup are not handled by this section and are skipped.
            if not any(tag.name == 'ul' or tag.name == 'li' for tag in result_html):
                # Concatenate the stripped text of every node, ignoring bare whitespace nodes.
                result_html = ''.join(i.text.strip() for i in result_html if i != ' ' and i != '\n')

                result = result_html
                finalresult = result.replace("<br>", "").replace("<br/>", "")
                # Word count of the extracted text.
                descriptionlen = len(finalresult.split())
                print(finalresult)
                print("-------------------")





#-------------------------------DESCRIPTION---------
# Prints the product description (the text between the first bold heading and
# the next bold heading) and then the "Product Specifications and Features"
# bullet list together with its word count.

# Section titles that may directly follow the opening heading; the description
# is only extracted when the next bold tag is exactly one of these.
DESCRIPTION_NEXT_HEADINGS = (
    "Key Ingredients:", "Key Benefits:", "Directions For Use:", "Safety Information:", "Uses:", "Product Specifications and Features:",
    "Key Ingredients", "Key Ingredient:", "Key Benefits", "Directions For Use", "Safety Information", "Uses", "Product Specifications and Features",
    "Brief Details:", "Brief Details", "Basic Details:", "Basic Details",
    "Product Specification and Features", "Product Specification and Features:",
)

# Accepted spellings of the specifications heading; each one is tried as-is
# and then with a trailing colon.
BOLD_TAGS = [
    "Product Specification and Features",
    "Product Specifications and Features",
    "Product Specifications & Features",
    "Product Specification & Feature"
]

if first_strong:
    second_strong = first_strong.find_next(tag_used)
    if second_strong and second_strong.text in DESCRIPTION_NEXT_HEADINGS:
        try:
            first_strong_index = maindiv.contents.index(first_strong)
            second_strong_index = maindiv.contents.index(second_strong)
        except ValueError:
            # One of the headings is not a direct child of maindiv, so the
            # region between them cannot be sliced out of maindiv.contents.
            result_html = None
        else:
            result_html = maindiv.contents[first_strong_index + 1:second_strong_index]

        # Descriptions that contain <ul>/<li> markup are skipped.
        if result_html is not None and not any(tag.name in ('ul', 'li') for tag in result_html):
            # Concatenate the stripped text of every node, ignoring bare whitespace nodes.
            result_html = ''.join(i.text.strip() for i in result_html if i != ' ' and i != '\n')

            result = result_html
            finalresult = first_strong.text + " " + result.replace("<br>", "").replace("<br/>", "")
            # Word count of the heading plus description text.
            descriptionlen = len(finalresult.split())
            print(finalresult)
            print("-------------------")

#---------------------------------------------------------------------------------------------------------------------------
            # NOTE(review): this specifications section is nested inside the
            # description branch, so it only runs when a description was
            # printed above. Confirm that is intended and not an indentation slip.
            BOLD_TAG = None          # heading text that actually matched
            benefits_header = None   # the matching heading tag inside maindiv
            try:
                # Find the first heading spelling that exists and is followed
                # by a sibling <ul>.
                for heading in BOLD_TAGS:
                    for label in (heading, heading + ":"):
                        candidate = maindiv.find(tag_used, string=label)
                        if candidate is not None and candidate.find_next_sibling("ul") is not None:
                            benefits_header = candidate
                            BOLD_TAG = label
                            break
                    if benefits_header is not None:
                        break

                if benefits_header is None:
                    print(BOLD_TAGS[0], " not found")
                    print("-------------------")
                else:
                    # Text of the next bold sibling, used to detect where this
                    # section ends. It is stripped the same way as the text it
                    # is compared against below.
                    next_header_tag = benefits_header.find_next_sibling(tag_used)
                    next_header = next_header_tag.get_text(strip=True) if next_header_tag else None

                    benefits = []
                    for ul in benefits_header.find_next_siblings("ul"):
                        # Stop once a bold tag carrying the next header's text precedes this list.
                        if next_header and ul.find_previous_sibling(
                                lambda tag: tag.name == tag_used and next_header in tag.get_text(strip=True)):
                            break
                        # Stop as well once any <div> sibling precedes this list.
                        elif ul.find_previous_sibling("div"):
                            break
                        # Non-breaking spaces are normalised to plain spaces.
                        benefits.extend([li.text.replace('\xa0', ' ') for li in ul.find_all("li")])

                    word_count = sum(len(benefit.split()) for benefit in benefits)
                    print(BOLD_TAG, benefits)

                    print(BOLD_TAG, "Count:-", word_count)
                    print("-------------------")
            except Exception:
                # Best-effort section: report the failure and keep the script alive.
                print("Error in ", BOLD_TAG or BOLD_TAGS[0])
                print("-------------------")
OTC scraper (new)

0 comments:

Post a Comment

Populars

Archives

OTC scraper (new)

Next

Newer Post

Previous

Older Post

0 comments:

Post a Comment