Created
January 22, 2021 05:27
-
-
Save rakeshtembhurne/34f6d186f0ad457361060ffd3efde064 to your computer and use it in GitHub Desktop.
delete
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"use strict"; | |
// Third-party dependencies: Puppeteer drives the browser, lodash is kept
// available for object helpers used elsewhere in the script.
const puppeteer = require("puppeteer");
const _ = require('lodash');
/**
 * Thin logging helper: forwards exactly one value to `console.log`.
 *
 * @param {*} data - Value to print; any extra arguments are ignored.
 * @returns {void}
 */
const log = (data) => console.log(data);
/**
 * Scrolls the page down in 1000px steps, one step per second, until the
 * accumulated distance reaches `document.body.scrollHeight`.
 *
 * The scrolling logic runs inside the page through `page.evaluate`; the
 * scroll height is re-read on every step.
 *
 * @param {object} page - Object exposing `evaluate(fn)` (a Puppeteer page).
 * @returns {Promise<void>} Settles once the in-page scrolling has finished.
 */
async function autoScroll(page) {
  const scrollToBottom = () =>
    new Promise((done) => {
      const stepPx = 1000;
      let scrolledPx = 0;
      const ticker = setInterval(() => {
        // Read the height before scrolling, on every tick.
        const pageHeight = document.body.scrollHeight;
        window.scrollBy(0, stepPx);
        scrolledPx += stepPx;
        if (scrolledPx < pageHeight) {
          return;
        }
        clearInterval(ticker);
        done();
      }, 1000);
    });

  await page.evaluate(scrollToBottom);
}
/**
 * Reports whether a value carries no usable content.
 *
 * Empty values are: `undefined`, `null`, `NaN`, blank or whitespace-only
 * strings, and objects/arrays/array-likes without own enumerable keys.
 * DOM nodes (anything exposing a numeric `nodeType`) are never empty, so the
 * function can be used as an "element exists" check on `querySelector`
 * results regardless of whether the node has own enumerable properties.
 *
 * The function is injected into the page as source text by `main`
 * (`addScriptTag`), so it must stay self-contained.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} `true` when the value is considered empty.
 */
function isEmpty(value) {
  if (value === undefined || value === null) {
    return true;
  }
  if (typeof value === 'number') {
    // `value === NaN` is always false; Number.isNaN is the reliable check.
    return Number.isNaN(value);
  }
  if (typeof value === 'string') {
    return value.trim().length === 0;
  }
  if (typeof value === 'object') {
    if (typeof value.nodeType === 'number') {
      // A DOM node exists even when it has no own enumerable keys.
      return false;
    }
    return Object.keys(value).length === 0;
  }
  return false;
}
/**
 * Reads a nested value by a dotted/bracketed path such as `a.b[0].c`.
 *
 * Traversal stops and `def` is returned as soon as the root or an
 * intermediate value is `null`/`undefined` (instead of throwing). As before,
 * `def` is also returned when the final value is `undefined`, while a final
 * `null` is returned as-is.
 *
 * The function is injected into the page as source text by `main`
 * (`addScriptTag`), so it must stay self-contained.
 *
 * @param {*} obj - Root object or array to read from.
 * @param {string|number} path - Property path; `[n]` is treated like `.n`.
 * @param {*} [def] - Fallback returned when the path cannot be resolved.
 * @returns {*} The resolved value or `def`.
 */
function get(obj, path, def) {
  const rawPath = String(path);
  const segments = rawPath
    .replace(/\[/g, '.')
    .replace(/]/g, '')
    .split('.');

  // A leading bracket ("[0].a") becomes ".0.a"; drop the empty first segment.
  if (rawPath.charAt(0) === '[' && segments.length > 1) {
    segments.shift();
  }

  let current = obj;
  for (const segment of segments) {
    if (current === undefined || current === null) {
      return def;
    }
    current = current[segment];
  }
  return current === undefined ? def : current;
}
// Listing scraped when main() is called without an argument.
// Alternative targets kept from the original script:
//   https://www.expedia.com.au
//   https://i-know-you-faked-user-agent.glitch.me/new-window
const DEFAULT_HOTEL_URL = "https://www.expedia.com.au/Sydney-Hotels-Shangri-La-Hotel.h11974.Hotel-Information?chkin=2021-01-29&chkout=2021-01-30&x_pwa=1&rfrr=HSR&pwa_ts=1610694652769&referrerUrl=aHR0cHM6Ly93d3cuZXhwZWRpYS5jb20uYXUvSG90ZWwtU2VhcmNo&useRewards=false&rm1=a2&regionId=178312&destination=Sydney+%28and+vicinity%29%2C+New+South+Wales%2C+Australia&destType=MARKET&neighborhoodId=6132025&trackingData=AAAAECNwmltY6Lf7yh2vFtnbNS7MoQH94W9EOm4f8HsXDajktBN-LFDIGJIiMR3aEYYZCwoqz2BI1Fz3Muwf5ZA47POaGKdEJ0-fCGkET8vsaRp_11FtevKgwiMtJ7mkZt-BIdxFMMuiF1oRr5uK-CqVxGkx8vZErSYXsQeSNwl4Ds93GvSjyYZRVv6ZItpseoYPjA-6SzY9MpdPmHSMYl6wSjx8kODFuxGooXum1OxN_VNa5ihRb9aaEKqhUO0Egq28oqKw4W_C5goxtZJj6-6VlIJPnYUYJFDOeQQZYxchv7ORw5h4-k7fAditKo6jYTdtTOlwiDhgaKNdH5BX0lDun_N0dyrg8KH8NYWqrzLCWg3AtIgC_wn75wfomBLY13qokhxe-Ab7U-3RL8srmFfZfyWi_Q6_CGZAUWpKQFX7dzn0zsoBykIZzd_pg4RHG8IliFor9lOHLW_A2EMcrDtYQhwRBDOnS4PyHyZ1nBcrXgE2-uSI0SU0YyH-GU6BKoP7QeE0t3czevPwxPOf2YuzLAYe83FVyyARO1VcXE-WrFB7BsuF4Carz-4w1il_XLnOwGq6o1f2cJn57xSX0RxaqugO13s_61do469ZP489Blkbqekjjn53FhAas3Rdqsq6KQFidEi6pow7CZOPHjceLzMnF6aES8KW_FOZa0UPBRZ64fwPRrdHnZdUu58F6F5-RovqHB6zaoVlYN_TI_d0Hx_8eY_BEtWyGxx9F086-RSetdkvZ3M5EXceoRvO1QBLdcevB6dFMD0eWls_kAy_X2FEnPogRUgpv1nbys3JcidwNfQHa-BaqOB7kqXa2OBb1YBp8t-6Qjs-INdXIZEl6r3clOhWKr4ZPAFWCUfRHoa536zq3dWceyj96Y8BnfvaGgpKU7hsWdSlABVrvniHZ1FZZO6x7RAwOG6J18BoHBXzmB_VgAheVJ9df-Ukg3qwXpVW4HhCFxTJnygMPGiEHMj3hWlcyRbfHQaoB8khIUzW1nGd2P9v-9_KH2sVu7fs3h7wHG3OciDJVlP5Lse6yZs%3D&rank=1&testVersionOverride=Buttercup%2C31936.102311.0%2C33775.98848.1%2C38414.114301.0%2C33739.99567.0%2C37898.109354.1%2C37930.113882.2%2C37949.107324.0%2C39046.114579.0&slots=HSR_A&position=1&beaconIssued=2021-01-15T07%3A10%3A51&sort=RECOMMENDED&top_dp=295&top_cur=AUD&semdtl=&selectedRoomType=202123389&selectedRatePlan=230207305";

/**
 * Runs inside every new document (via `page.evaluateOnNewDocument`) and
 * overrides a few `navigator` properties, then patches `window.open` so
 * popups report the supplied user agent.
 *
 * Executed in the browser, not in Node: it may only use its argument and
 * page globals.
 *
 * @param {string} fakeUserAgent - User agent string reported by popups.
 * @returns {void}
 */
function spoofNavigatorFingerprint(fakeUserAgent) {
  Object.defineProperty(navigator, 'platform', { get: () => 'MacIntel' });
  Object.defineProperty(navigator, 'productSub', { get: () => '20030107' });
  Object.defineProperty(navigator, 'vendor', { get: () => 'Apple Computer, Inc.' });

  const nativeOpen = window.open;
  window.open = (...args) => {
    const popup = nativeOpen.apply(window, args);
    // window.open returns null when the popup is blocked.
    if (popup) {
      Object.defineProperty(popup.navigator, 'userAgent', { get: () => fakeUserAgent });
    }
    // Callers of window.open expect the window handle back.
    return popup;
  };
  window.open.toString = () => 'function open() { [native code] }';
}

/**
 * Runs inside the hotel page (via `page.evaluate`) and collects the hotel
 * name, review data, "About this property" text, address, star rating,
 * review header and popular amenities from the DOM.
 *
 * Executed in the browser, not in Node. It relies on `isEmpty` and `get`
 * having been injected into the page beforehand. It clicks the "Reviews" tab
 * and the amenities button and waits one second after each click before
 * reading the resulting markup.
 *
 * @returns {Promise<object>} `{ hotel, reviews, reviewHeader }`.
 */
async function extractHotelDetails() {
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const find = (root, selector) => (root ? root.querySelector(selector) : null);
  const findAll = (root, selector) => (root ? Array.from(root.querySelectorAll(selector)) : []);
  const hasOwn = (object, key) => Object.prototype.hasOwnProperty.call(object, key);

  // Text of the first match under `root`, or `fallback` when missing/blank.
  const textOf = (root, selector, fallback) => {
    const node = find(root, selector);
    const text = node ? node.textContent : null;
    return isEmpty(text) ? fallback : text;
  };

  const clickIfPresent = (node) => {
    if (node && typeof node.click === 'function') {
      node.click();
      return true;
    }
    return false;
  };

  // Maps <meta itemprop="..."> tags under `root` onto output keys; blank
  // content becomes null and unknown itemprops are skipped.
  const collectMeta = (root, keyByItemprop) => {
    const collected = {};
    findAll(root, 'meta').forEach((meta) => {
      const itemprop = meta.getAttribute('itemprop');
      if (hasOwn(keyByItemprop, itemprop)) {
        collected[keyByItemprop[itemprop]] = isEmpty(meta.content) ? null : meta.content;
      }
    });
    return collected;
  };

  // Name
  const hotelName = textOf(document, 'h1.uitk-type-display-700', null);

  // Reviews
  const reviewTab = Array.from(document.querySelectorAll('li.uitk-tab'))
    .find((tab) => tab.textContent === "Reviews");
  clickIfPresent(find(reviewTab, 'a'));
  await pause(1000);

  const reviewRoot = document.getElementById('Reviews');

  const scoreKeyByLabel = {
    'Cleanliness': 'cleanlinessOverMax',
    'Amenities': 'amenityScoreOverMax',
    'Staff & service': 'serviceAndStaffOverMax',
    'Property conditions & facilities': 'hotelConditionOverMax',
  };
  // Start from the defaults so categories missing from the page still appear.
  const reviewScore = {
    cleanlinessOverMax: '0/5',
    amenityScoreOverMax: '0/5',
    serviceAndStaffOverMax: '0/5',
    hotelConditionOverMax: '0/5',
  };
  findAll(
    reviewRoot,
    '.uitk-flex.uitk-flex-column.uitk-flex-item.uitk-flex-basis-half_width.all-y-margin-three'
  ).forEach((card) => {
    const label = textOf(card, '.uitk-type-300', null);
    if (hasOwn(scoreKeyByLabel, label)) {
      reviewScore[scoreKeyByLabel[label]] = textOf(card, 'h3', null);
    }
  });

  const ratingCounts = findAll(
    reviewRoot,
    '.uitk-flex.uitk-flex-align-items-flex-end.uitk-flex-gap-one.uitk-type-200'
  ).map((row) => {
    const title = textOf(row, '.uitk-progress-bar-title', "");
    const value = textOf(row, '.uitk-progress-bar-description', null);
    return { rating: get(title.split(' - '), '0', ''), value };
  });

  const overallRating = textOf(
    find(reviewRoot, '.uitk-type-900.uitk-type-regular'),
    'span',
    null
  );
  const countData = textOf(
    find(reviewRoot, '.uitk-flex.uitk-flex-column.all-t-padding-one'),
    'button',
    ''
  );
  const count = get(countData.split(' '), '0', 0);
  const superlative = textOf(reviewRoot, '.uitk-type-400.uitk-type-bold', null);

  // About
  const aboutSection = Array.from(document.querySelectorAll('.uitk-card-aloha-content-section'))
    .find((section) => {
      const heading = find(section, 'h2');
      return heading !== null && heading.textContent === 'About this property';
    });
  const about = findAll(
    aboutSection,
    '.uitk-layout-grid-item.uitk-layout-grid-item-columnspan-medium-1.uitk-layout-grid-item-columnspan-large-2'
  ).map((cell) => {
    const heading = textOf(cell, 'h3', '');
    const body = textOf(cell, '.uitk-flex.uitk-flex-gap-one.uitk-flex-wrap.all-t-padding-two', '');
    return (`${heading}\n${body}`).trim();
  });

  // Address
  const addressRoot = document.querySelector(
    '.uitk-flex.uitk-flex-column.uitk-spacing-margin-large-inlinestart-three.uitk-layout-grid-item-columnspan-large-5'
  );
  const addressButton = find(
    addressRoot,
    'button.uitk-link.all-t-padding-two.uitk-link-layout-inline.uitk-type-300'
  );
  const addressLine = textOf(
    find(addressRoot, '.uitk-flex-item.uitk-type-left.uitk-flex-grow-1'),
    'span',
    null
  );
  const overview = Object.assign(
    collectMeta(addressButton, {
      name: 'city',
      addressRegion: 'state',
      streetAddress: 'streetAddress',
      addressCountry: 'countryCode',
      latitude: 'latitude',
      longitude: 'longitude',
    }),
    { addressLine }
  );

  // Star Rating
  const ratingRoot = document.querySelector('.uitk-rating');
  const starIconCount = findAll(ratingRoot, 'svg.uitk-icon.uitk-icon-xsmall').length;
  const starText = textOf(ratingRoot, 'span', '').split(' ')[0];
  // Prefer the textual rating; fall back to counting star icons when the
  // text is missing (a number was previously compared to a string here, so
  // the icon count could never be selected).
  const stars = isEmpty(starText) ? starIconCount : starText;

  // Review score header
  const reviewHeader = collectMeta(
    document.querySelector('[data-stid="content-hotel-reviewsummary"]'),
    {
      ratingValue: 'overallRating',
      reviewCount: 'count',
      description: 'superlative',
    }
  );

  // Amenities
  const amenitiesRoot = document.querySelector('[data-stid="hotel-amenities-list"]');
  if (!clickIfPresent(find(amenitiesRoot, 'button'))) {
    console.log("Button not found");
  }
  await pause(1000);

  // Scrape dialog box
  const amenitiesDialog = document.querySelector(
    '.uitk-dialog-layer.uitk-dialog-layer-responsive.layer-overlay-active'
  );
  const amenitySections = findAll(amenitiesDialog, '.uitk-spacing.uitk-spacing-padding-blockend-four');
  // Prefer the section titled "Popular amenities"; otherwise take the first
  // section that has no <h3> heading at all.
  const popularSection =
    amenitySections.find((section) => {
      const heading = find(section, 'h3');
      return heading !== null && heading.textContent === 'Popular amenities';
    }) || amenitySections.find((section) => find(section, 'h3') === null);
  const topSummary = findAll(popularSection, 'li').map((item) => ({
    description: item.textContent,
  }));

  return {
    hotel: {
      additionalInfo: {
        propertyDescription: {
          hotelDescription: about.join('\n'),
          about,
        },
      },
      amenities: {
        topSummary,
      },
      overview,
      hotelName,
      stars,
    },
    reviews: {
      overallRating,
      ratingCounts,
      count,
      superlative,
      ...reviewScore,
    },
    reviewHeader,
  };
}

/**
 * Launches a browser, opens the hotel page, scrapes it and prints the result
 * as pretty-printed JSON.
 *
 * Steps: enable request interception to force a referer header, install the
 * navigator/popup overrides, set user agent and viewport, navigate, wait for
 * `.main-region` (best effort), inject `isEmpty`/`get` into the page, log all
 * cookies, scroll to the bottom and run the in-page extraction. The browser
 * is always closed, including when a step throws.
 *
 * @param {string} [url] - Page to scrape; defaults to the built-in listing.
 * @returns {Promise<object>} The scraped `{ hotel, reviews, reviewHeader }`.
 */
async function main(url = DEFAULT_HOTEL_URL) {
  const HEADLESS = true;
  const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36";
  const HEADLESS_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36';

  const browser = await puppeteer.launch({
    headless: HEADLESS,
    args: [
      "--disable-setuid-sandbox",
      `--user-agent=${USER_AGENT}`,
      // '--proxy-server=socks325655://127.0.0.1:9050',
    ],
    ignoreHTTPSErrors: true
  });

  try {
    const page = await browser.newPage();

    // Some Hacks
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      const originalHeaders = req.headers();
      log({ reqHeaders: originalHeaders });
      // headers() must be called; copy its result instead of mutating it.
      const headers = { ...originalHeaders, referer: 'http://www.expedia.com.au/' };
      req.continue({ headers }).catch((err) => {
        log(`Failed to continue request: ${err.message}`);
      });
    });

    await page.evaluateOnNewDocument(spoofNavigatorFingerprint, USER_AGENT);
    await page.setUserAgent(HEADLESS ? HEADLESS_USER_AGENT : USER_AGENT);
    await page.setViewport({ width: 1920, height: 1008 });

    await page.goto(url, { waitUntil: 'load', timeout: 0 }); // timeout: 0 disables the navigation timeout

    // Best effort: scraping continues even when the main region never shows.
    // (The original carried a disabled screenshot-and-return-{} path here.)
    await page
      .waitForSelector('.main-region', { visible: true, timeout: 3000 })
      .catch(() => {
        log('Page not found');
      });

    // Fixed settle delay, implemented without page.waitForTimeout.
    await new Promise((resolve) => setTimeout(resolve, 3000));

    // Make the Node-side helpers available to the in-page extraction code.
    await page.addScriptTag({ content: `${isEmpty} ${get}` });

    // Public CDP session instead of the private page._client.
    const cdpSession = await page.target().createCDPSession();
    const cookies = await cdpSession.send('Network.getAllCookies');
    log(cookies);
    await cdpSession.detach();

    await autoScroll(page);

    const hotelDetails = await page.evaluate(extractHotelDetails);
    console.log(JSON.stringify(hotelDetails, null, 2));
    return hotelDetails;
  } finally {
    // Without this the Chromium child process keeps the script alive.
    await browser.close();
  }
}
// Script entry point: report a failed run instead of leaving the promise
// returned by main() unhandled, and signal failure through the exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment