Last active
May 14, 2022 23:15
-
-
Save pfeilbr/24e18abf1ae051a90589ae16488cadb8 to your computer and use it in GitHub Desktop.
fetch all aws directory api metadata (arch diagrams, products, blog posts, builders library articles, etc.)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Fetch all AWS directory API metadata (arch diagrams, products, blog posts, Builders' Library articles, etc.).
(async () => {
// Directory IDs understood by the aws.amazon.com `/api/dirs/items/search` endpoint
// that this script knows how to crawl. Each entry is `{ directoryId }`.
const directories = [
  { directoryId: "event-content" },
  { directoryId: "amazon-redwood" }, // Builders' Library (per the notes at the end of this file)
  { directoryId: "aws-products" },
  { directoryId: "free-tier-products" },
  { directoryId: "blog-posts" },
  { directoryId: "whats-new" },
  { directoryId: "security-bulletins" },
  { directoryId: "media-resources" },
];
/**
 * Log a value to the console as pretty-printed JSON (2-space indent).
 *
 * @param {*} o - Any value accepted by `JSON.stringify`.
 */
const l = (o) => {
  console.log(JSON.stringify(o, null, 2));
};
/**
 * Return a promise that resolves (with `undefined`) after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed straight to `setTimeout`.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
/**
 * GET `url` and return the parsed JSON body.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<*>} The decoded JSON payload.
 * @throws {Error} When the response status is not in the 2xx range. Without this
 *   check an HTML error page would surface as an opaque JSON parse error.
 */
const fetchJSON = async (url) => {
  const resp = await fetch(url);
  if (!resp.ok) {
    throw new Error(`fetchJSON("${url}") failed: HTTP ${resp.status} ${resp.statusText}`);
  }
  return resp.json();
};
/**
 * Fetch the default (first) search response for a directory; callers read its
 * `metadata` (`count`, `totalHits`) to plan pagination.
 *
 * @param {string} directoryId - Directory to query, e.g. "blog-posts".
 * @returns {Promise<object>} The raw search response returned by `fetchJSON`.
 */
const fetchDirectoryMetadata = async (directoryId) => {
  // Encode the id so characters such as `&` or `#` cannot corrupt the query string.
  const metadataURL = `https://aws.amazon.com/api/dirs/items/search?item.directoryId=${encodeURIComponent(directoryId)}&item.locale=en_US`;
  return fetchJSON(metadataURL);
};
/**
 * Download a directory page by page, sequentially, and collect every non-empty page.
 *
 * @param {string} directoryId - Directory to crawl, e.g. "blog-posts".
 * @param {object} metadata - Search response whose `metadata.count` is used as the
 *   page size and whose `metadata.pageCount` is the number of pages to request.
 * @returns {Promise<object[]>} The raw page responses, in page order. Empty when
 *   `pageCount` is not a positive integer.
 */
const fetchDirectoryContent = async (directoryId, metadata) => {
  const { count, pageCount } = metadata.metadata;
  // A NaN / Infinity / fractional / negative page count (e.g. totalHits divided by a
  // zero page size) cannot be iterated; treat it as "nothing to fetch" rather than
  // throwing a RangeError.
  const totalPages = Number.isInteger(pageCount) && pageCount > 0 ? pageCount : 0;
  const urlTemplate = `https://aws.amazon.com/api/dirs/items/search?item.directoryId=${encodeURIComponent(directoryId)}&size=${count}&item.locale=en_US&page=`;
  const pages = [];
  for (let pageIndex = 0; pageIndex < totalPages; pageIndex++) {
    const data = await fetchJSON(`${urlTemplate}${pageIndex}`);
    if (Array.isArray(data.items) && data.items.length > 0) {
      pages.push(data);
      continue;
    }
    // An empty page means the API has run out of results. It appears to cap paging
    // at 1000 pages (observed: directoryId=blog-posts stopped at pageIndex=1000 of
    // 2077 computed pages), so report when that happens before the expected last page.
    if (pageIndex < totalPages - 1) {
      l(`directoryId=${directoryId},action=break,pageIndex=${pageIndex},pageIndexes.length=${totalPages}`);
    }
    break;
  }
  return pages;
};
/**
 * Fetch every page of a directory.
 *
 * Reads the directory's metadata, derives the page count from `totalHits` and the
 * API's reported page size (`count`), stores it on `metadata.metadata.pageCount`,
 * and delegates the crawl to `fetchDirectoryContent`.
 *
 * @param {string} directoryId - Directory to crawl.
 * @returns {Promise<object[]>} The page responses returned by `fetchDirectoryContent`.
 */
const fetchDirectory = async (directoryId) => {
  const metadata = await fetchDirectoryMetadata(directoryId);
  const { totalHits, count } = metadata.metadata;
  // The API can report `count: 0`; dividing by it would give Infinity (or NaN) pages,
  // so fall back to zero pages whenever either number is unusable.
  metadata.metadata.pageCount = count > 0 && totalHits > 0 ? Math.ceil(totalHits / count) : 0;
  return fetchDirectoryContent(directoryId, metadata);
};
/**
 * Prompt the user for a save location (File System Access API) and write a small
 * placeholder JSON document (`{hello: 'world'}`) to it.
 *
 * @returns {Promise<void>}
 * @throws Whatever the picker, `write` or `close` rejects with; on a write/close
 *   failure the stream is aborted first so it is not left open.
 */
const saveFile = async () => {
  // Ask the user where to save and obtain a handle for that file.
  const newHandle = await window.showSaveFilePicker();
  // Create a FileSystemWritableFileStream to write to.
  const writableStream = await newHandle.createWritable();
  const obj = { hello: 'world' };
  const blob = new Blob([JSON.stringify(obj, null, 2)], { type: 'application/json' });
  try {
    await writableStream.write(blob);
    // Closing the stream is what commits the contents to disk.
    await writableStream.close();
  } catch (err) {
    try {
      await writableStream.abort();
    } catch (abortErr) {
      // Ignore: the original failure below is the one worth reporting.
    }
    throw err;
  }
};
/**
 * Entry point: crawl the selected directories one after another and log a
 * one-line summary (pages, items, page size) for each.
 *
 * Any error is caught and logged, so the returned promise never rejects.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  // To try the file-save path instead, call `await saveFile(); return;` here.
  try {
    // NOTE(review): slice(0, 1) limits the run to the first directory only, which
    // looks like a debugging limit; drop the slice to crawl every directory.
    for (const directory of directories.slice(0, 1)) {
      const data = await fetchDirectory(directory.directoryId);
      const totalItems = data.reduce((sum, page) => sum + page.items.length, 0);
      // A directory may yield no pages at all; report 0 instead of throwing on data[0].
      const itemsPerPage = data.length > 0 ? data[0].metadata.count : 0;
      l(`directory.directoryId=${directory.directoryId},totalPages=${data.length},totalItems=${totalItems},itemsPerPage=${itemsPerPage}`);
      // Optional throttle between directories: await sleep(1000)
    }
  } catch (e) {
    console.log(e);
  }
};
await main();
})();
/*
var params = {
  Bucket: 'STRING_VALUE', // required
  ContinuationToken: 'STRING_VALUE',
  Delimiter: 'STRING_VALUE',
  EncodingType: url,
  ExpectedBucketOwner: 'STRING_VALUE',
  FetchOwner: true || false,
  MaxKeys: 'NUMBER_VALUE',
  Prefix: 'STRING_VALUE',
  RequestPayer: requester,
  StartAfter: 'STRING_VALUE'
};
s3.listObjectsV2(params, function(err, data) {
  if (err) console.log(err, err.stack); // an error occurred
  else console.log(data); // successful response
});
*/
/*
const oldMain = async () => {
  const baseURL = `https://aws.amazon.com/api/dirs/items/search?item.directoryId=whitepapers&sort_by=item.additionalFields.sortDate&sort_order=desc&size=9&item.locale=en_US&tags.id=GLOBAL%23content-type%23reference-arch-diagram&page=`
  let page = 0;
  let more = true;
  let items = [];
  while (more) {
    const resp = await fetch(baseURL + `${page}`)
    const data = await resp.json();
    console.log(data);
    more = data.metadata.count > 0;
    if (more) {
      items.push(...data.items);
      page++;
    }
  }
  console.log(`pages=${page},items.length=${items.length}`);
  const output = JSON.stringify(items, null, 2);
  console.log(output);
}
*/
/*
# TODO
* all blogs (<https://aws.amazon.com/blogs/>)
* template URL - "https://aws.amazon.com/api/dirs/items/search?item.directoryId=blog-posts&sort_by=item.additionalFields.createdDate&sort_order=desc&size=10&item.locale=en_US&page=1"
* all events content (<https://aws.amazon.com/events/events-content>) page=0...N
* template URL "https://aws.amazon.com/api/dirs/items/search?item.directoryId=event-content&sort_by=item.dateCreated&sort_order=desc&size=12&item.locale=en_US&tags.id=GLOBAL%23language%23english&page=1"
* Builders Library - https://aws.amazon.com/api/dirs/items/search?item.directoryId=amazon-redwood&sort_by=item.additionalFields.customSort&sort_order=asc&size=24&item.locale=en_US
* whats new, item.directoryId=whats-new
* item.directoryId=security-bulletins
* item.directoryId=aws-products
* item.directoryId=blog-posts
* item.directoryId=media-resources
* item.directoryId=free-tier-products
* use <https://lunrjs.com/> for searching
* step fn processing logic - need to figure out how to not download everything each run. way to download only new or changed items since last run
* define work by getting metadata for number of results via https://...?item.directoryId=${directoryId}&item.locale=en_US&page=0. generate a single SQS message for each unique URL
* use returned `metadata.count` for `size` query string parameter
* &sort_by=item.[dateCreated|dateUpdated]&sort_order=desc
* lambda subscription to SQS. process sequentially. set batch size to >1 initially to see if throttling occurs. can always set batch size to 1
---
```sh
# source: <https://gist.github.com/garystafford/37442d8fd8dde388f50856c6a2900b0d>
# One-liner to retrieve a list of all AWS products from aws.amazon.com/products sorted by product category (requires jq). Worked as of 2022-01-03. Page format tends to change a lot...
curl --silent --compressed \
'https://aws.amazon.com/api/dirs/items/search?item.directoryId=aws-products&sort_by=item.additionalFields.productCategory&sort_order=asc&size=500&item.locale=en_US' \
| jq -r '.items[].item | .additionalFields.productCategory + " | " + .additionalFields.productName' \
| sort
```
* <https://github.com/tycarac/aws-documents> - good reference project that "Downloads AWS documents, currently whitepapers, from AWS documentation website."
* <https://github.com/nragusa/aws-newrelease-slack> - An AWS CDK application that sends AWS new service and feature release announcements to a Slack channel of your choice
querying various content types in the directory by tags.id querystring value (`#` must be URL-encoded as %23)
"contentType": "AWS Solution", tags.id=GLOBAL%23content-type%23solution
"contentType": "Pattern", tags.id=GLOBAL%23content-type%23pattern
"contentType": "Reference Architecture Diagram", tags.id=GLOBAL%23content-type%23reference-arch-diagram
"contentType": "Guide", tags.id=GLOBAL%23content-type%23tech-guide
"contentType": multi-valued, tags.id=GLOBAL%23content-type%23video
"contentType": "Whitepaper", tags.id=GLOBAL%23content-type%23whitepaper
--- general response shape
{
  "items": [],
  "metadata": {
    "count": 0,
    "totalHits": 299
  },
  "fieldTypes": {
    "updateDate": "Date",
    "imageSrcUrl": "URL",
    "featureFlag": "Text",
    "description": "LongText",
    "sortDate": "Date",
    "docTitle": "Text",
    "primaryURL": "URL",
    "datePublished": "Date",
    "publishedText": "Text",
    "footerInfoSubtext": "Text",
    "subHeadline": "Text",
    "enableShare": "Boolean",
    "category": "Text",
    "contentType": "Text"
  }
}
--- example item (items[0])
{
  "item": {
    "id": "whitepapers#image-moderation-chatbot",
    "locale": "en_US",
    "directoryId": "whitepapers",
    "name": "image-moderation-chatbot",
    "author": "julicoll",
    "createdBy": "julicoll",
    "lastUpdatedBy": "julicoll",
    "numImpressions": 0,
    "score": 0,
    "dateCreated": "2019-06-25T17:21:57+0000",
    "dateUpdated": "2021-07-29T17:01:46+0000",
    "additionalFields": {
      "datePublished": "2018-12-05",
      "publishedText": "December 2018",
      "description": "Shows you how to build a serverless chatbot on AWS that monitors your chat channels and removes images containing suggestive or explicit content.<p><a href=\"https://github.com/awslabs/lambda-refarch-imagemoderationchatbot?did=wp_card&trk=wp_card\" target=\"_blank\" rel=\"noopener\">Code</a></p><p class=\"m-subheadline\">Media Services | Serverless</p>",
      "docTitle": "Image Moderation Chatbot",
      "sortDate": "2018-12-05",
      "enableShare": "1",
      "contentType": "Reference Architecture Diagram",
      "primaryURL": "https://github.com/awslabs/lambda-refarch-imagemoderationchatbot?did=wp_card&trk=wp_card"
    }
  },
  "tags": [
    {
      "id": "GLOBAL#content-type#reference-arch-diagram",
      "locale": "en_US",
      "tagNamespaceId": "GLOBAL#content-type",
      "name": "Reference Architecture Diagram",
      "description": "Reference Architecture Diagram",
      "createdBy": "jenbar",
      "lastUpdatedBy": "jenbar",
      "dateCreated": "2020-04-29T05:19:31+0000",
      "dateUpdated": "2022-02-03T03:31:09+0000"
    },
    {
      "id": "GLOBAL#methodology#serverless",
      "locale": "en_US",
      "tagNamespaceId": "GLOBAL#methodology",
      "name": "Serverless",
      "description": "Serverless",
      "createdBy": "jenbar",
      "lastUpdatedBy": "jenbar",
      "dateCreated": "2020-06-05T07:06:34+0000",
      "dateUpdated": "2022-02-03T03:32:11+0000"
    },
    {
      "id": "GLOBAL#tech-category#media-services",
      "locale": "en_US",
      "tagNamespaceId": "GLOBAL#tech-category",
      "name": "Media Services",
      "description": "Media Services",
      "createdBy": "jarfaa",
      "lastUpdatedBy": "jenbar",
      "dateCreated": "2020-07-17T03:06:10+0000",
      "dateUpdated": "2022-02-03T03:35:28+0000"
    }
  ]
}
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment