// Created
// October 4, 2021 09:30
// -
// -
// Save misner/a37428b437113741c244e2edf4cb51ef to your computer and use it in GitHub Desktop.
// This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters
//current code
import {
  ErrorHandlingHOF,
  log,
  stringExtractedIsInValid,
  organicExtractData_LogIfValueIsInvalid,
} from "./utils/index.js";
import {
  slugifyStr,
  getFirstNWordsInStr,
  longestWord,
} from "./utils/stringManipulations.js";
import { COUNTRY_CODE_ISO3166_MAPPING } from "./utils/countryNameToCountryIso3166Code.js";
import {
  addWebsiteNameTrackingParams,
  getLwnRootDomain,
} from "./utils/urlManipulations.js";
import { WEBSITE_NAME_INCLUDING_DEV_MODE } from "./utils/fnConstants.js";
/* Database | |
*/ | |
//import DB from "baqend"; | |
var DB = require("baqend"); | |
const DB_NAME = 'Job'; | |
let connecting; | |
/**
 * Connects to the Baqend app and logs in with the "aws-to-baqend" account,
 * reusing the same promise on every later call.
 *
 * The promise is stored in the module-level `connecting` variable, so once it
 * has been created this function returns it instead of connecting again.
 *
 * Login is best-effort: a login failure is logged and swallowed, so the
 * returned promise still resolves. A failure of `DB.connect` itself is not
 * caught here and rejects the returned promise.
 *
 * @param {string} app - Baqend app name passed to `DB.connect`.
 * @returns {Promise<void>} Settles once connect and the optional login are done.
 */
const connect_to_baqend = (app) => {
  if (typeof connecting === 'object') {
    log("on this lambda execution, the baqend database is already connected thanks to shared-memory ...");
    return connecting;
  }

  connecting = DB.connect(app, true).then(() => {
    console.log("Baqend Connected. Awaiting login..");

    // Only log in when no user is attached yet; this avoids Baqend's
    // "user is already logged in" error.
    const loginIfNecessaryPromise = DB.User.me
      ? Promise.resolve()
      : DB.User.login("aws-to-baqend", process.env.AWS_TO_BAQEND);

    return loginIfNecessaryPromise
      .then(() => {
        console.log('Logged into Baqend. Connection is ready to be used...');
      })
      .catch((loginIfNecessaryError) => {
        // Log the actual failure reason. The previous code logged the promise
        // object here, which hid why the login failed.
        console.log("Baqend login failed", loginIfNecessaryError);
      });
  });

  return connecting;
};
/**
 * Trims `text` and collapses every run of whitespace into a single space.
 *
 * @param {string} text
 * @returns {string}
 */
const collapseWhitespace = (text) => text.trim().replace(/\s+/g, ' ');

/**
 * Lambda entry point: validates and normalizes one extracted job listing,
 * then inserts it into the Baqend class named by DB_NAME.
 *
 * When a required field is invalid, the problem is logged and the function
 * returns early with `undefined` (nothing is inserted). Otherwise it attempts
 * the insert and returns `{ statusCode: 200, body: 'Baqend: Inserted' }`.
 * Errors raised while connecting or inserting are logged and swallowed.
 * The function is wrapped by ErrorHandlingHOF, so what the caller finally
 * sees also depends on that wrapper.
 *
 * @param {object} event - Raw listing fields (see the destructuring below).
 * @param {object} context - Lambda context; only `awsRequestId` is read, for logging.
 * @param {Function} callback - Unused; kept for the Lambda handler signature.
 * @returns {Promise<{statusCode: number, body: string}|undefined>}
 */
const handler = ErrorHandlingHOF(async function(event, context, callback) {
  const {
    position,
    link,
    pubDate,
    companyName,
    jobLocationCity,
    base_url,
    extract_data_type,
    base_url_country,
    jkNumber,
    description,
    cleanFinalApplyDestinationUrl,
    tagsMatched,
    companyCertainDomain,
    companyClearbitAutocompApproxDomain,
    companyCertainLogoUrl,
    companyClearbitAutocomppeApproxLogoUrl
  } = event;

  log("Got", {
    position,
    link,
    pubDate,
    companyName,
    jobLocationCity,
    base_url,
    extract_data_type,
    base_url_country,
    jkNumber,
    description,
    companyCertainDomain,
    cleanFinalApplyDestinationUrl,
    tagsMatched,
    companyClearbitAutocompApproxDomain,
    companyClearbitAutocomppeApproxLogoUrl
  });

  /* Standardization & creation of the inputs for the database.
     Note: keep the same order as the columns in the database for easier
     work on db<->code.
  */

  /************* value for listing_id *************************/
  //not performed on Lambda level but on db level (on Baqend performed by "Modules"):
  //only solution to avoid concurrency issues and let the "last mile" (the db)
  //make sure it never sets the same value for 2 jobs

  /************* standardize extract_data_type *************************/
  const jobDataSourceType = extract_data_type;
  //check it is a non-empty string
  if (stringExtractedIsInValid(jobDataSourceType)) {
    organicExtractData_LogIfValueIsInvalid("extracttion data type");
    return;
  }
  //check it is one of the authorized values
  const possibleDataExtractionType = ['scraping', 'api'];
  const isAuthorizedValue = (text, acceptedValues) => acceptedValues.includes(text);
  if (!isAuthorizedValue(jobDataSourceType, possibleDataExtractionType)) {
    organicExtractData_LogIfValueIsInvalid("extracttion data type");
    return;
  }
  log("jobDataSourceType is : " + jobDataSourceType);

  /************* standardize listing_master_scraping_query *************************/
  const jobMasterScrapingQuery = base_url;
  if (stringExtractedIsInValid(jobMasterScrapingQuery)) {
    organicExtractData_LogIfValueIsInvalid("jobMasterScrapingQuery");
    return;
  }
  log("jobMasterScrapingQuery is : " + jobMasterScrapingQuery);

  /************* standardize listing_data_source_url *************************/
  const jobDataSourceUrl = link;
  if (stringExtractedIsInValid(jobDataSourceUrl)) {
    organicExtractData_LogIfValueIsInvalid("job details url on source website");
    return;
  }
  log("jobDataSourceUrl is : " + jobDataSourceUrl);

  /************* standardize listing_title *************************/
  let jobPosition = position;
  if (stringExtractedIsInValid(jobPosition)) {
    //the label used to be a copy-paste of the source-url label above
    organicExtractData_LogIfValueIsInvalid("job position");
    return;
  }
  //fix issue #558
  //where a huge string without any white space was breaking feed rendering:
  //put spaces around the slashes of an over-long word so it can wrap
  const longestWordInPosition = longestWord(jobPosition);
  if (longestWordInPosition.length > 30) {
    const spacedLongestWord = longestWordInPosition.replace(/\//g, ' / ');
    //function replacer: the new text is inserted verbatim, so "$&"-style
    //sequences inside a job title are not treated as replacement patterns
    jobPosition = jobPosition.replace(longestWordInPosition, () => spacedLongestWord);
  }
  //remove neutral words that indicate the job position includes the company
  //name or a city: drop everything from the detected keyword onwards.
  //not super modular, but all the keywords for all languages live here
  const excludeCity = new RegExp("basé à|based in", "gi");//case insensitive match
  jobPosition = jobPosition.split(excludeCity)[0];
  //no toLowerCase() here: it would turn acronyms like "IT" into "it"
  let normalizedJobPosition = collapseWhitespace(jobPosition);
  const jobPositionTest = getFirstNWordsInStr(normalizedJobPosition, 3);
  if (jobPositionTest === jobPositionTest.toUpperCase()) {
    //the first 3 words are 100% uppercase
    //ex: https://www.indeed.fr/voir-emploi?jk=f5472bfe2b5a5f00 => "COMMERCIAL H/F"
    //in that case, to enforce consistent formatting on the listing feed,
    //lowercase the whole string. Loophole: a word that should stay uppercase
    //(such as "IT") becomes "it" and is then capitalized to "It" via css,
    //but it's the least bad option.
    //Why only the first 3 words and not the whole string: checking everything
    //left "BUSINESS DEVELOPMENT INTERN PARIS (M/F/D) GmbH" unhandled
    normalizedJobPosition = normalizedJobPosition.toLowerCase();
  }
  log("normalizedJobPosition is : " + normalizedJobPosition);

  /************* set value for listing_location_restricted *************************/
  const jobLocationRestricted = false;

  /************* standardize base_url_country *************************/
  const jobLocationCountry = base_url_country;
  if (stringExtractedIsInValid(jobLocationCountry)) {
    organicExtractData_LogIfValueIsInvalid("base_url_country");
    return;
  }
  const normalizedjobLocationCountry = collapseWhitespace(jobLocationCountry.toLowerCase());
  log("normalizedjobLocationCountry is : " + normalizedjobLocationCountry);

  /************* set value for listing_location_country_iso3166 *************************/
  const jobLocationCountryIso3166 = COUNTRY_CODE_ISO3166_MAPPING[normalizedjobLocationCountry];
  if (typeof jobLocationCountryIso3166 === 'undefined') {
    log("there was a problem : we couldn't find the iso 3166 country code for " +
      base_url_country + " inside functions-src/utils/countryNameToCountryIso3166Code.js");
    return;
  }
  log("jobLocationCountryIso3166 is : " + jobLocationCountryIso3166);

  /************* standardize listing_location_city *************************/
  if (stringExtractedIsInValid(jobLocationCity)) {
    organicExtractData_LogIfValueIsInvalid("jobLocationCity");
    return;
  }
  const normalizedjobLocationCity = collapseWhitespace(jobLocationCity.toLowerCase());
  log("normalizedjobLocationCity is : " + normalizedjobLocationCity);

  /************* set the value of listing_location_city_slug *************************/
  const jobLocationCitySlug = slugifyStr(normalizedjobLocationCity);
  log("jobLocationCitySlug is : " + jobLocationCitySlug);

  /************* standardize tagsMatched *************************/
  //a missing or non-array tagsMatched used to crash on .filter();
  //treat it like any other invalid field: log and skip the listing
  if (!Array.isArray(tagsMatched)) {
    organicExtractData_LogIfValueIsInvalid("tagsMatched");
    return;
  }
  const jobsTagArr = tagsMatched;
  const normalizedJobsTagArr = jobsTagArr
    .filter((tag) => !stringExtractedIsInValid(tag))
    .map((tag) => collapseWhitespace(tag.toLowerCase()));
  if (normalizedJobsTagArr.length < jobsTagArr.length) {
    //at least one tag was dropped as invalid
    console.log('This is invalid: ', jobsTagArr);
  }
  log("normalizedJobsTagArr is:");
  log(normalizedJobsTagArr);

  /************* set the value of listing_tags_slug *************************/
  const jobTagsSlugArr = normalizedJobsTagArr.map((tag) => slugifyStr(tag));
  log("jobTagsSlugArr is:");
  log(jobTagsSlugArr);

  /************* set the value of listing_description *************************/
  //a non-string description used to crash on .replace();
  //log and skip the listing instead
  if (typeof description !== 'string') {
    organicExtractData_LogIfValueIsInvalid("description");
    return;
  }
  //remove any occurrence of the word "null" surrounded by spaces
  //fixes issue #560
  const jobDescription = description.replace(/ null /g, ' ');
  log("jobDescription is : " + jobDescription);

  /************* standardize cleanFinalApplyDestinationUrl *************************/
  const jobApplyUrl = cleanFinalApplyDestinationUrl;
  if (stringExtractedIsInValid(jobApplyUrl)) {
    organicExtractData_LogIfValueIsInvalid("final destination url");
    return;
  }
  //note on process.env.URL below:
  //theoretically we should have used a conditional assignment like on /sitemaps-gen.js
  //but as links get clicked often even while working, we'd rather have the
  //destination website see a clean referrer (for example aijobs.tech) than a
  //deploy-preview url such as https://deploy-preview-296--zen-colden-2b17b5.netlify.com
  const targetWebsiteRootUrl = getLwnRootDomain(process.env.URL);
  const normalizedJobApplyUrl = addWebsiteNameTrackingParams(jobApplyUrl, targetWebsiteRootUrl);
  log("normalizedJobApplyUrl is : " + normalizedJobApplyUrl);

  /************* standardize content_entity_name *************************/
  if (stringExtractedIsInValid(companyName)) {
    organicExtractData_LogIfValueIsInvalid("companyName");
    return;
  }
  const normalizedCompanyName = collapseWhitespace(companyName.toLowerCase());
  log("normalizedCompanyName is : " + normalizedCompanyName);

  /************* set the value of content_entity_name_slug *************************/
  const companyNameSlug = slugifyStr(normalizedCompanyName);
  log("companyNameSlug is : " + companyNameSlug);

  /************* set the value of content_entity_initials *************************/
  //note: built from companyNameSlug instead of normalizedCompanyName because
  //we don't want accents in initials, and special characters or apostrophes
  //(like in l'oreal) would otherwise need heavier handling.
  //The slug already "rationalized" the name for us.
  const companyInitials = companyNameSlug
    .split("-") //words are separated by "-"
    .map((word) => word[0]) //first letter of each word
    .join("")
    .slice(0, 2); //keep at most the first 2 initials
  log("companyInitials is : " + companyInitials);

  /************* set value for content_entity_domain *************************/
  let companyDomain;
  if (companyCertainDomain) {
    companyDomain = companyCertainDomain;
  } else if (!stringExtractedIsInValid(companyClearbitAutocompApproxDomain)) {
    //best option after companyCertainDomain
    //note: not sanitized/normalized here; per the original author the value
    //comes from the Clearbit API already cleaned
    companyDomain = companyClearbitAutocompApproxDomain;
  } else {
    companyDomain = '';
    log("unfortunately we could not find the company domain for this job post (no biggie)");
  }
  log("companyDomain is : " + companyDomain);

  /************* set value for content_entity_logo_url *************************/
  let companyLogoUrl;
  if (companyCertainLogoUrl) {
    companyLogoUrl = companyCertainLogoUrl;
  } else if (companyClearbitAutocomppeApproxLogoUrl) {
    //best option after companyCertainLogoUrl (same Clearbit note as above)
    companyLogoUrl = companyClearbitAutocomppeApproxLogoUrl;
  } else {
    companyLogoUrl = '';
    log("unfortunately we could not find the company logo url for this job post (no biggie)");
  }
  log("companyLogoUrl is : " + companyLogoUrl);

  /************* set value for listing_duration *************************/
  //n/a no limit on organic listings

  /************* set value for listing_type *************************/
  const listingType = "organic";
  log("listingType is : " + listingType);

  /************* paid_listing_* columns *************************/
  //n/a for organic listings: job_requirement, job_responsibility,
  //job_application_instructions, payment_id, paid_amount, payment_currency,
  //payment_status

  /************* value for paid_listing_contact_email *************************/
  //always empty for organic listings. Per the original author's note this is a
  //safeguard for the front-end "report broken link/empty link" button, which
  //relies on the email being empty when the destination url is empty too
  const listingApplyEmail = "";
  log("listingApplyEmail is : " + listingApplyEmail);

  /************* value for listing_url *************************/
  //not performed on Lambda level but on db level (on Baqend performed by "Modules"),
  //so uniqueness is guaranteed by the db (via job_offer_id, itself set on db level)

  //TODO (from the original author): move the duplicate check to the top of the
  //handler to return early and skip all the normalization above; only filter
  //out a listing if the other job with the same destination url is recent
  //(< 2 weeks old).

  /* Connect to the db and inject the values calculated above.
     Note: per the original author, rejection of listings whose
     normalizedJobApplyUrl already exists in a record is only performed at db
     level by a Baqend module (for efficiency and concurrency reasons).
  */
  try {
    await connect_to_baqend("listings-network");
    await DB[DB_NAME]({
      target_website: WEBSITE_NAME_INCLUDING_DEV_MODE,
      listing_category: 'j',
      listing_master_scraping_query: jobMasterScrapingQuery,
      listing_data_source_type: jobDataSourceType,
      listing_data_source_url: jobDataSourceUrl,
      listing_title: normalizedJobPosition,
      listing_location_restricted: jobLocationRestricted,
      listing_location_country: normalizedjobLocationCountry,
      listing_location_country_iso3166: jobLocationCountryIso3166,
      listing_location_city: normalizedjobLocationCity,
      listing_location_city_slug: jobLocationCitySlug,
      listing_tags: normalizedJobsTagArr,
      listing_tags_slug: jobTagsSlugArr,
      listing_description: jobDescription,
      listing_destination_url: normalizedJobApplyUrl,
      listing_apply_email: listingApplyEmail,
      content_entity_name: normalizedCompanyName,
      content_entity_name_slug: companyNameSlug,
      content_entity_initials: companyInitials,
      content_entity_domain: companyDomain,
      content_entity_logo_url: companyLogoUrl,
      listing_type: listingType
    }).insert();
    log("we injected into the db the data of " + context.awsRequestId);
  } catch (e) {
    //note: an error thrown inside Baqend handlers (ex: onInsert) is conveyed
    //here too, so this branch covers both connection and insert failures.
    //The original author was unsure whether such errors should reach the
    //monitoring dashboard, so they are only logged.
    console.log('Something went wrong connecting to baqend', e);
  }

  //NOTE(review): this response is also returned when the insert above failed;
  //kept unchanged because callers may rely on it - confirm the intent.
  return {
    statusCode: 200,
    body: 'Baqend: Inserted'
  };
});

export { handler };
// Sign up for free
// to join this conversation on GitHub.
// Already have an account?
// Sign in to comment