@onurhuseyincantay · Created September 29, 2018 17:53

from bs4 import BeautifulSoup
import urllib.request
import os, ssl
import re
import pandas as pd


def getDataFromUrl(text, tagForData=None, idForData=None,
                   classForData=None, isSoup=False):
    # Fall back to an unverified SSL context so urlopen does not fail
    # on certificate errors (note: idForData is currently unused).
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
            getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    url = text
    class_for_data = classForData
    url_response = urllib.request.urlopen(url)
    soup = BeautifulSoup(url_response, 'html.parser')
    if isSoup:
        return soup
    # Otherwise return the text of every tag matching the given name/class.
    datas = [p.text for p in soup.find_all(tagForData, class_for_data)]
    return datas
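

# A minimal usage sketch (added for illustration, not part of the original
# gist; the URL and the tag/class values below are assumptions). With
# isSoup=True the parsed BeautifulSoup document is returned; otherwise the
# text of every matching tag comes back as a list.
def exampleGetDataFromUrl():
    soup = getDataFromUrl("https://www.hepsiburada.com", isSoup=True)
    texts = getDataFromUrl("https://www.hepsiburada.com",
                           tagForData="span", classForData="title")
    return soup, texts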


def getComments(baseUrl):
    try:
        datas = []
        # "-yorumlari" is the reviews page of a hepsiburada product.
        soupfirst = getDataFromUrl(baseUrl + "-yorumlari", isSoup=True)
        span = soupfirst.find_all("span", {"itemprop": "title"})
        Category = span[1].text
        try:
            # The last pagination item holds the total page count.
            div = soupfirst.find("div", {"class": "pagination"})
            li = div.find_all("li")
            maxPageCount = int(li[-1].text)
        except Exception:
            maxPageCount = 0  # no pagination widget: nothing to scrape
        # Walk every review page ("sayfa" = page).
        for x in range(1, maxPageCount + 1):
            soup = getDataFromUrl(baseUrl + "-yorumlari?sayfa=" + str(x),
                                  isSoup=True)
            ul = soup.find("ul", {"id": "reviews"})
            strongBaslik = ul.find_all("strong",
                                       {"class": "subject", "itemprop": "name"})
            pReview = ul.find_all("p",
                                  {"class": "review-text", "itemprop": "description"})
            pAnswer = soup.find_all("p", {"class": "answer"})
            ratings = ul.find_all("div", {"class": "ratings active"})
            for i in range(len(strongBaslik)):
                baslik = strongBaslik[i].text  # review title
                yorum = pReview[i].text        # review body
                # The answer line reads roughly "Evet (n) ... Hayır (n)"
                # (yes/no helpfulness votes); strip the parentheses to
                # keep only the numbers.
                text = re.split(" ", pAnswer[i].text)
                txt1 = text[1].split("\n")
                txt2 = text[2].split("\n")
                yesScore = re.sub(r'\((.*)\)', r'\1', txt1[0])
                noScore = re.sub(r'\((.*)\)', r'\1', txt2[0])
                # The star rating is encoded as a CSS width percentage,
                # e.g. style="width: 80%" -> 80.
                splitted = re.split(" ", ratings[i].attrs["style"])
                score = int(splitted[1][:-1])
                datas.append({"Category": Category, "Rating": score,
                              "Header": baslik, "Review": yorum,
                              "YesScore": yesScore, "NoScore": noScore})
    except Exception as e:
        print(e)
        return []
    return datas
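

# Illustrative sketch (the product path is hypothetical, not from the
# gist): each returned row is a dict with Category, Rating, Header,
# Review, YesScore and NoScore keys, ready to feed into pandas.
def exampleGetComments():
    rows = getComments("https://www.hepsiburada.com/ornek-urun-p-XYZ123")
    return pd.DataFrame(rows)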


def getLinks(baseurl, start, stop, filename):
    normalLinks = []
    for i in range(start, stop):
        baseUrl = baseurl + "&sayfa=" + str(i)
        data = getDataFromUrl(baseUrl, isSoup=True)
        try:
            ul = data.find("ul", attrs={'class': "product-list"})
            a = ul.find_all("a")
            for item in a:
                # Keep hrefs that start with a single "/" (product pages);
                # skip "//"-style absolute links.
                if item.attrs["href"][0] != item.attrs["href"][1]:
                    normalLinks.append("https://www.hepsiburada.com"
                                       + item.attrs["href"])
        except Exception as e:
            print(e)
    print("NormalLinks : {}".format(normalLinks))
    # Persist one link per line to "<filename>_<start>-<stop>".
    linksBilgisayar = "\n".join(normalLinks)
    with open(filename + "_" + str(start) + "-" + str(stop), "w") as text_file:
        text_file.write(linksBilgisayar)
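

# Sketch of how the link file consumed below could have been produced; the
# category URL is an assumption, and note that getLinks writes to
# "linkBilgisayar_0-250" (no ".txt"), so its output would need renaming to
# match the driver below.
def exampleGetLinks():
    getLinks("https://www.hepsiburada.com/bilgisayarlar?filtre=1",
             0, 250, "linkBilgisayar")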


# Driver: read previously collected product links (one per line) and
# scrape the reviews for entries 1500-2499 into a CSV.
GeneralData = []
with open("linkBilgisayar0-250.txt", "r") as file:
    text = file.read()
GeneralData = text.split("\n")
Comments = []
for x in range(1500, 2500):
    print(x)  # progress indicator
    Comments.extend(getComments(GeneralData[x]))
df = pd.DataFrame(Comments)
df.to_csv("GG-1500-2500.csv")
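
# The CSV written above can be read back with pandas if needed, e.g.
# df = pd.read_csv("GG-1500-2500.csv", index_col=0)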