from bs4 import BeautifulSoup
import urllib.request
import os, ssl
import re
import pandas as pd


def getDataFromUrl(text, tagForData=None, idForData=None,
                   classForData=None, isSoup=False):
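    """Fetch the given URL, parse it with BeautifulSoup, and return either the full
    soup (isSoup=True) or the text of every tag matching tagForData/classForData.
    Note: idForData is accepted but currently unused."""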
    # Bypass SSL certificate verification (avoids certificate errors on some systems)
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
            getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    url = text
    class_for_data = classForData
    url_response = urllib.request.urlopen(url)
    soup = BeautifulSoup(url_response, 'html.parser')
    if isSoup:
        return soup
    datas = [p.text for p in soup.find_all(tagForData, class_for_data)]
    return datas


def getComments(baseUrl):
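    """Scrape every review page ("-yorumlari") of a hepsiburada product URL and
    return a list of dicts with Category, Rating, Header, Review, YesScore, NoScore."""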
    try:
        datas = []
        soupfirst = getDataFromUrl(baseUrl + "-yorumlari", isSoup=True)
        span = soupfirst.find_all("span", {"itemprop": "title"})
        Category = span[1].text
        try:
            div = soupfirst.find("div", {"class": "pagination"})
            li = div.find_all("li")
            maxPageCount = int(li[-1].text)
        except Exception:
            # no pagination element: the product has a single review page
            maxPageCount = 1
        # comments section
        for x in range(1, maxPageCount + 1):
            soup = getDataFromUrl(baseUrl + "-yorumlari?sayfa=" + str(x), isSoup=True)
            ul = soup.find("ul", {"id": "reviews"})
            strongBaslik = ul.find_all("strong", {"class": "subject", "itemprop": "name"})
            pReview = ul.find_all("p", {"class": "review-text", "itemprop": "description"})
            pAnswer = soup.find_all("p", {"class": "answer"})
            ratings = ul.find_all("div", {"class": "ratings active"})
            data = dict()
            for i in range(len(strongBaslik)):
                baslik = strongBaslik[i].text   # review title
                yorum = pReview[i].text         # review body
                # yes/no helpfulness counts sit in parentheses inside the answer paragraph
                text = re.split(" ", pAnswer[i].text)
                txt1 = text[1].split("\n")
                txt2 = text[2].split("\n")
                yesScore = re.sub(r'\((.*)\)', r'\1', txt1[0])
                noScore = re.sub(r'\((.*)\)', r'\1', txt2[0])
                # the star rating is taken from the percentage in the tag's inline style
                splitted = re.split(" ", ratings[i].attrs["style"])
                score = int(splitted[1][:-1])
                # print(score); print(baslik); print(yorum)
                # print("Yes " + yesScore); print("No " + noScore)
                data = {"Category": Category, "Rating": score, "Header": baslik,
                        "Review": yorum, "YesScore": yesScore, "NoScore": noScore}
                datas.append(data.copy())
                data = dict()
    except Exception as e:
        print(e)
        return []
    return datas


def getLinks(baseurl, start, stop, filename):
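    """Walk listing pages start..stop of baseurl, collect product links and write
    them, one per line, to a file named filename_start-stop."""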
    normalLinks = []
    for i in range(start, stop):
        baseUrl = baseurl + "&sayfa=" + str(i)
        data = getDataFromUrl(baseUrl, isSoup=True)
        try:
            ul = data.find("ul", attrs={'class': "product-list"})
            a = ul.find_all("a")
            for item in a:
                # keep only hrefs whose first two characters differ (skips "//..."-style links)
                if item.attrs["href"][0] != item.attrs["href"][1]:
                    normalLinks.append("https://www.hepsiburada.com" + item.attrs["href"])
                else:
                    continue
        except Exception as e:
            print(e)
    print("NormalLinks : {}".format(normalLinks))
    linksBilgisayar = "\n".join(normalLinks)
    with open(filename + "_" + str(start) + "-" + str(stop), "w") as text_file:
        text_file.write(linksBilgisayar)
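

# Driver: load previously collected product links and scrape their reviews into a CSV.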
GeneralData = []
with open("linkBilgisayar0-250.txt", "r") as file:
    text = file.read()
    GeneralData = text.split("\n")

Comments = []
for x in range(1500, 2500):
    print(x)
    Comments.extend(getComments(GeneralData[x]))

df = pd.DataFrame(Comments)
df.to_csv("GG-1500-2500.csv")