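"""Reddit listing-page scraper (Gist pjha1994/05a852430427738f44c0, created December 10, 2015).

For each subreddit in a fixed list, fetch the 'hot', 'new', 'top', 'rising',
'controversial', and 'gilded' tabs, follow up to 7 "next" pages per tab, and
append each post's title, link, timestamp, author tagline, and vote scores to
a per-subreddit text file. This summary is inferred from the code below; the
selectors target the 2015-era reddit.com markup and may not match later
versions of the site.
"""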
import re
import os
import httplib2
from datetime import datetime
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

c = 0  # global counter of prompts written so far, across all parsed pages
def make_soup(s):
    # Fetch a URL and return a BeautifulSoup tree restricted to <div> tags;
    # return None when the argument does not look like a URL.
    match = re.compile(r'https://|http://|www\.|\.com|\.in|\.org|gov\.in')
    if re.search(match, s):
        http = httplib2.Http()
        status, response = http.request(s)
        # Parse only <div> elements to keep the tree small.
        page = BeautifulSoup(response, "html.parser", parse_only=SoupStrainer('div'))
        return page
    else:
        return None
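# A minimal usage sketch for make_soup (the URL below is illustrative, not
# part of the original script):
#
#   soup = make_soup('https://www.reddit.com/r/python/hot')
#   if soup is not None:
#       print(len(soup.find_all('div')))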
def test_internet():
    # Block until a request to google.com succeeds, i.e. until an internet
    # connection is available. Note: this busy-waits without sleeping.
    while True:
        try:
            make_soup("https://www.google.com")
            break
        except Exception:
            continue
def parse1(s):
    # Parse one listing page: for every post container ("thing" div), write
    # the title, link, timestamp, author tagline, and vote scores to the
    # open file f.
    global c
    soup = make_soup(s)
    if soup is None:
        return
    for div in soup.find_all('div', class_=["thing", "id-t3_3ua12m", "linkflair",
                                            "linkflair-normal", "odd", "link"]):
        try:
            if div.p is None or div.p.next_sibling is None or div.p.next_sibling.next_sibling is None:
                continue
            # The third sibling after the first <p> is either the post body
            # ("entry") or its thumbnail; branch on its class.
            node = div.p.next_sibling.next_sibling.next_sibling
            x = node['class']
            if x[0] == 'entry':
                element = '\nPROMPT ' + str(c + 1) + '\n'
                if node.p is not None and node.p.a is not None:
                    element += node.p.a.string + '\n'
                    element += node.p.a['href'] + '\n'
                tagline = node.find('p', {'class': 'tagline'})
                if tagline is not None and tagline.time is not None:
                    element += tagline.time['datetime'] + '\t'
                    element += tagline.time['title'] + '\t'
                    element += tagline.time.string + '\n'
                if tagline is not None and tagline.a is not None:
                    element += tagline.a.string + '\n'
                    element += tagline.text + '\n'
                if div.div.find('div', {'class': 'score likes'}) is not None:
                    element += 'score likes ' + div.div.find('div', {'class': 'score likes'}).string + '\t'
                    element += 'score dislikes ' + div.div.find('div', {'class': 'score dislikes'}).string + '\t'
                    element += 'score unvoted ' + div.div.find('div', {'class': 'score unvoted'}).string + '\n\n'
                f.write(element)
                c = c + 1
            elif x[0] == 'thumbnail':
                element = '\nPROMPT ' + str(c + 1) + '\n'
                entry = div.find('div', {'class': 'entry unvoted'})
                if entry is not None and entry.p is not None and entry.p.a is not None and entry.p.a.string is not None:
                    element += entry.p.a.string + '\n'
                    element += entry.p.a['href'] + '\n'
                tagline = entry.find('p', {'class': 'tagline'})
                if tagline is not None and tagline.time is not None:
                    element += tagline.time['datetime'] + '\t'
                    element += tagline.time['title'] + '\t'
                    element += tagline.time.string + '\n'
                    if tagline.a is not None:
                        element += tagline.a.string + '\n'
                        element += tagline.text + '\n'
                scores = div.p.next_sibling.next_sibling
                if (scores.find('div', {'class': 'score likes'}) is not None
                        and scores.find('div', {'class': 'score dislikes'}) is not None
                        and scores.find('div', {'class': 'score unvoted'}) is not None):
                    element += 'score likes ' + scores.find('div', {'class': 'score likes'}).string + '\t\t'
                    element += 'score dislikes ' + scores.find('div', {'class': 'score dislikes'}).string + '\t\t'
                    element += 'score unvoted ' + scores.find('div', {'class': 'score unvoted'}).string + '\n'
                f.write(element)
                c = c + 1
        except Exception:
            # Posts with unexpected markup are skipped.
            print('ERROR')
            continue
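# Layout assumption behind parse1 (inferred from the selectors above, not
# documented anywhere): within each <div class="thing ..."> post container,
# the third sibling after the first <p> carries class 'entry' (the post body,
# with the title link at entry > p > a) or 'thumbnail'; the timestamp and
# author live inside <p class="tagline">, and the vote counts inside
# 'score likes' / 'score dislikes' / 'score unvoted' divs.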
def count_next_of_current(s, m):
    # Return the href of the "next page" link on the current listing page,
    # or None when there is no further page.
    test_internet()
    soup = make_soup(s)
    for link in soup.find_all('a', {'rel': ['next']}):
        return link['href']
    return None
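# For example, starting from 'https://www.reddit.com/r/python/hot', the
# returned href is expected to look like
# 'https://www.reddit.com/r/python/hot/?count=25&after=...' (illustrative;
# the exact query string depends on Reddit's pagination).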
def read_reddit_images(change_file_number, m, x):
    # Scrape the tab x ('hot', 'new', ...) of subreddit m, following up to
    # maximum_number_of_next_pages "next" links, and append the results to
    # '<subreddit>_<tab>.txt'.
    test_internet()
    global f
    global select_tab
    select_tab = x
    x = m + '_' + select_tab + '.txt'
    f = open(x, 'a', encoding='utf-8')
    FORMAT = '%d-%m-%Y %H:%M:%S'
    f.write('\n\n\n\niteration number ' + str(change_file_number) + ' ' + datetime.now().strftime(FORMAT) + '\n\n')
    maximum_number_of_next_pages = 7
    s = 'https://www.reddit.com/r/' + m + '/' + select_tab
    parse1(s)
    count = 0
    print('for ' + m + ' ' + select_tab + ' current page number is ' + str(count))
    while count < maximum_number_of_next_pages:
        test_internet()
        s = count_next_of_current(s, m)
        if s is not None:
            parse1(s)
            count = count + 1
            print(count)
        else:
            break
    f.write('\n\niteration number ' + str(change_file_number) + ' ' + datetime.now().strftime(FORMAT) + '\n\n')
    f.close()
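# Usage sketch (the subreddit name and iteration number are illustrative):
#
#   read_reddit_images(1, 'python', 'hot')   # appends to 'python_hot.txt'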
def maincall(m, i):
    # Scrape every listing tab of subreddit m for iteration i.
    for tab in ('hot', 'new', 'top', 'rising', 'controversial', 'gilded'):
        read_reddit_images(i, m, tab)
def subs(b):
    # Read the persistent iteration counter from mytext.txt, increment it,
    # then scrape every subreddit in b.
    test_internet()
    t = open('mytext.txt', 'r')
    i = t.read()
    t.close()
    t = open('mytext.txt', 'w')
    t.write(str(int(i) + 1))
    t.close()
    for k in b:
        test_internet()
        maincall(k, i)
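# Note: subs() expects a file 'mytext.txt' in the working directory holding a
# single integer (the iteration counter). It must exist before the first run,
# e.g. created with open('mytext.txt', 'w').write('0').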
def main():
    test_internet()
    # Three candidate subreddit lists follow; only the last (uncommented)
    # assignment to b takes effect.
    b = ['24hoursupport','3amjokes','ADHD','AMA','AcademicPhilosophy','AcademicPsychology','Aerospace','Android','AndroidQuestions','Anger','Anxiety',
         'AskAnthropology','AskComputerScience','AskElectronics','AskEngineers','AskHR','AskHistorians','AskMen','AskPhysics','AskReddit','AskScienceDiscussion',
         'AskScienceFiction','AskSocialScience','AskWomen','Ask_Politics','Bash','BehavioralEconomics','BigDataJobs','BipolarReddit','CAD','C_Programming',
         'ComputerScience','Confession','CoverTheWorld','Cplusplus','CppForbeginners','CrappyDesign','CrazyIdeas','DIY','DIYCompSci','DailyProgrammer','DeadBedrooms',
         'DebateReligion','DecidingToBeBetter','DigitalNomad','DoesNotTranslate','ECE','Economics','EngineeringStudents','Entrepreneur','ExNoContact','FEA','FE_Exam',
         'Feminism','FluidMechanics','Foodforthought','FoundWords','Freethought','GetMotivated','GetStudying','GraphicsProgramming','HITsWorthTurkingFor','HTMLBattles',
         'HomeworkHelp','HowsYourJob','IAmA','IOPsychology','InternetIsBeautiful','LaTeX','LanguageLearning','LearnANewLanguage','LearnJava','LearnJavaScript',
         'LifeProTips','LinguisticsHumor','LongDistance','MachineLearning','Manufacturing','MathHelp','Meditation','NetworkingJobs','Neuropsychology','NoStupidQuestions',
         'ObjectiveC','PCMasterRace','PLC','PhilosophyofScience','PhsychologicalTricks','PoliticalDiscussion','Polyamory','PrintedCircuitBoard','Progether',
         'ProgrammerHumor','Proofreading','Python','RapeCounseling','RetailManagement','STEMdents','SWORDS','SWResources','SampleSize','SanctionedSuicide','Seduction',
         'SiblingSupport','Statistics','SuicideWatch','Swift','SysadminJobs','TechNews','ThermalPerformance','Tinder','TinyCode','TowerOfBabel','TrueAskReddit',
         'TrueReddit','Unix','VentureBiotech','WeMetOnline','Web_Development','WhatsTheWord','YoungJobs','academicpsychology','academicpublishing','accounting','advice',
         'androiddev','translator','answers','asklinguistics','askmath','askphotography','askreddit','askscience','assistance','astronomy','audiology','autism','badcode',
         'badlinguistics','beermoney','behavioralmedicine','behaviortherapy','bestof','bestofTLDR','bioengineering','biology','biotech','bodybuilding','bookquotes',
         'books','breadboard','bugs','buildapc','business','careerguidance','cfd','changemyview','chemicalengineering','chipdesign','civilengineering','cloudcomputing',
         'coding','coffeescript','cogneuro','cogneurocogsci','cognitivelinguistics','cogsci','compilers','complexsystems','compling','compression','compsci',
         'computerforensics','computers','computerscience','conlangs','conspiracy','construction','cosmology','coursearea','cpp','cpp_questions','crypto','cryptography',
         'cs50','csbooks','cscareerquestions','csharp','css','dae','dailyprogrammer','dailyscripts','darkinternet','dataisbeautiful','datamining','dementia','depression',
         'diy','documentaries','dotnet','downsyndrome','dyslexia','economics','education','eebooks','electricalengineering','electronics','engineering',
         'engineeringtechnology','entrepreneur','epidemiology','etymology','eurodiversity','everythingscience','evolution','evopsych','explainlikeimfive','favors',
         'finance','financialindependence','findareddit','forhire','forth','freelance','freelanceUK','freelanceWriters','funny','gadgets','genetics','getdisciplined',
         'getemployed','getmotivated','getting_over_it','goldredditsays','grammar','grammarwriting','graphic_design','hacking','hardware','history','holdmybeer',
         'homeworkhelp','html','htmlbasics','humanism','hwstartups','hypotheticalsituation','iWantToLearn','ideasfortheadmins','illegaltorrents','improvevocab','india',
         'ineedafavor','intel','intelligence','interview','inventions','iwantoutjobs','java','javaTIL','javacodegeeks','javahelp','javascript','jobbit','jobsearchhacks',
         'jokes','jquery','languagetechnology','learnjava','learnjavascript','learnmath','learnprogramming','learnpython','lectures','lifehacks','linguistics','linux',
         'linux4noobs','linuxquestions','literature','logic','machinelearning','marketing','masculism','math','mathbooks','mathematics','mathpsych','matlab',
         'mechanicalengineering','medicine','meditation','mentalhealth','mentors','metalworking','microsoft','mmfb','motivation','movies','music','mysql','needadvice',
         'networking','neuro','neurodiversity','neurophilosophy','neuropsychology','newproducts','news','newtoreddit','nonprofit_jobs','nootropics','obvious',
         'occupationaltherapy','ocd','offmychest','opengl','osdev','parkrangers','perl','philosophy','philosophyofScience','philosophyofscience','php','physics','pics',
         'politics','privacy','product_design','productivity','programbattles','programming','programmingbuddies','programmingchallenges','psychiatry','psychology',
         'psychopharmacology','psychotherapy','psychscience','puzzles','python','quotes','rage','rational','reasonstolive','rehabtherapy','relationship_advice',
         'relationships','resumes','riddles','robotics','ruby','saneorpsycho','schizophrenia','science','scientificresearch','self','selfhelp','selfimprovement','sex',
         'shittyaskscience','shittyideas','shittyprogramming','showerthoughts','simpleliving','slp','socialism','socialmedia','socialskills','sociology','software',
         'softwarearchitecture','softwaredevelopment','softwaregore','solotravel','space','specialed','startups','stopselfharm','suicidology','sysadmin','systems',
         'talesfromtechsupport','technology','techsupport','teenagers','testimonials','themixednuts','thisismyjob','tipofmytongue','todayilearned','tr',
         'translationstudies','travel','tutor','ultralight','undelete','undeleteShadow','undergraduateresearch','uniqueminds','visualbasic','web_programming','webdev',
         'whatisthis','whatstheword','windows','windowsazure','womenEngineers','words','work','workonline','worldnews','writingprompts']  # major list, once a week
    # Regular list (every day):
    # b = ['AskAnthropology','AskScienceDiscussion',
    #      'AskScienceFiction','AskSocialScience','Ask_Politics','ECE','Economics',
    #      'Freethought',
    #      'GetMotivated','GetStudying','GraphicsProgramming','Neuropsychology','NoStupidQuestions','PhsychologicalTricks',
    #      'PoliticalDiscussion','Web_Development','badcode',
    #      'biology','books','bugs','buildapc','compilers',
    #      'computers','computerscience','crypto',
    #      'cryptography','cs50','csbooks','cscareerquestions','dailyprogrammer',
    #      'dailyscripts','electronics','explainlikeimfive','grammar','hacking',
    #      'history','linux',
    #      'linux4noobs','linuxquestions','logic','mysql','networking',
    #      'opengl','philosophy','philosophyofScience','politics',
    #      'productivity','programmingchallenges',
    #      'shittyaskscience','shittyideas','shittyprogramming','showerthoughts',
    #      'socialism','socialskills','software']
    # Basic list (long interval); this is the assignment actually used.
    b = ['AskAnthropology','AskScienceDiscussion',
         'AskSocialScience',
         'Ask_Politics','badcode','biology','compilers',
         'computers','computerscience','crypto',
         'cryptography','cs50','csbooks',
         'cscareerquestions','dailyprogrammer','electronics','history',
         'linux','linux4noobs','linuxquestions','logic','psychology']
    # De-duplicate and sort the subreddit names.
    b = sorted(set(b))
    subs(b)

if __name__ == '__main__':
    main()