# Build the scraper image. --rm removes intermediate containers after a
# successful build; --force-rm removes them even when the build fails.
docker build --rm --force-rm -t r.j3ss.co/scrape .
# Run the scraper with ./results bind-mounted onto /root/cia, the directory
# the script writes into. The whole -v argument is quoted so a working
# directory containing spaces is not word-split.
docker run --rm -it -v "$(pwd)/results:/root/cia" r.j3ss.co/scrape
Created
December 28, 2018 23:00
-
-
Save jessfraz/a1aa52c7755bdac0a7f8d9e6fe701aca to your computer and use it in GitHub Desktop.
Scrape CIA public PDF files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python
# Scrape CIA public PDF files.
import lxml.html
from lxml.html.clean import Cleaner
import requests
import os.path
import signal
import sys
# Handle control+C.
def signal_handler(sig, frame):
    """SIGINT handler: announce the interruption and exit with status 0.

    Args:
        sig: Signal number passed in by the ``signal`` module (unused).
        frame: Current stack frame passed in by the ``signal`` module (unused).

    Raises:
        SystemExit: Always, with exit code 0.
    """
    print('You pressed Ctrl+C! Exiting...')
    sys.exit(0)
def lxmlize(url):
    """Download ``url`` and return it as a parsed lxml HTML tree.

    Every link in the returned tree is rewritten to an absolute URL,
    resolved against ``url``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The root element produced by ``lxml.html.fromstring``.
    """
    # %-formatting prints "GET-ing <url>" on Python 2 as well; the original
    # print("GET-ing", url) prints a tuple repr under a Python 2 interpreter.
    print("GET-ing %s" % (url,))
    resp = requests.get(url)
    page = lxml.html.fromstring(resp.text)
    page.make_links_absolute(url)
    return page
def publications(url):
    """Yield the publication links found on the index page at ``url``.

    A link qualifies when it sits inside the ``content-core`` div and its
    href contains both ``csi-studies`` and ``.html``.

    Args:
        url: Address of the index page to scan.

    Yields:
        lxml ``<a>`` elements, one per matching link.
    """
    page = lxmlize(url)
    link_query = "//div[@id='content-core']//a[contains(@href, 'csi-studies') and contains(@href, '.html')]"
    for anchor in page.xpath(link_query):
        yield anchor
def articles(url):
    """Yield ``(href, title)`` for every PDF link on the page at ``url``.

    The title is derived from the last path segment of the href: dashes
    become spaces, ``.pdf`` and any remaining ``pdf`` substring are removed,
    ``%20`` becomes a space, and the result is title-cased.

    Args:
        url: Address of the publication page to scan.

    Yields:
        Tuples of (absolute PDF URL, human-readable title).
    """
    page = lxmlize(url)
    for el in page.xpath("//a[contains(@href, '.pdf')]"):
        href = el.attrib['href']
        # Same replacement chain, in the same order, as the original one-liner.
        title = os.path.basename(href)
        title = title.replace("-", " ")
        title = title.replace(".pdf", "")
        title = title.replace("pdf", "")
        title = title.replace("%20", " ")
        yield href, title.title()
def main(page):
    """Download every PDF linked from the publications listed on ``page``.

    Files are stored as ``cia/<pdir>/<publication name>/<title>.pdf``
    relative to the current working directory. A file that already exists
    is not downloaded again.

    Args:
        page: URL of the index page that lists the publications.
    """
    pdir = "Unclassified Extracts from Classified Studies"
    parentDir = "cia/" + pdir
    for publication in publications(page):
        # text_content() already returns text, so no unicode() wrapper is
        # needed; dropping it keeps this runnable on Python 2 and Python 3.
        publication_name = publication.text_content()
        name = publication_name.replace(pdir, "").strip()
        target_dir = parentDir + "/" + name
        for href, text in articles(publication.attrib['href']):
            full_path = "{}/{}/{}.pdf".format(parentDir, name, text)
            if os.path.exists(full_path):
                continue
            print(full_path)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            response = requests.get(href)
            if not response.ok:
                # Do not store an HTTP error page under a .pdf name.
                print("Skipping %s (HTTP %s)" % (href, response.status_code))
                continue
            # Write to a temporary name and rename once complete, so that a
            # download interrupted by Ctrl+C is retried on the next run
            # instead of being skipped by the exists() check above.
            partial_path = full_path + ".part"
            with open(partial_path, 'wb') as fd:
                for block in response.iter_content(1024):
                    fd.write(block)
            os.rename(partial_path, full_path)
# Install the Ctrl+C handler, then start scraping from the Center for the
# Study of Intelligence library page.
signal.signal(signal.SIGINT, signal_handler)
main("https://www.cia.gov/library/center-for-the-study-of-intelligence")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# syntax=docker/dockerfile:1

# Python 2 base image: the scraper script was written for the Python 2
# interpreter.
# NOTE(review): Python 2 is end-of-life; moving to a python:3 tag requires the
# script to be Python 3 compatible first.
FROM python:2-alpine

# Install the libxml2/libxslt runtime libraries, then add the compiler and
# headers as a virtual package that is removed again in the same layer, so the
# toolchain never reaches the final image. C_INCLUDE_PATH is set only for the
# pip invocation instead of being baked into the runtime environment (the
# former ENV lines expanded undefined variables, producing a leading empty
# path element, and LD_LIBRARY_PATH pointed at a header directory).
RUN apk add --no-cache \
        libxml2 \
        libxslt \
    && apk add --no-cache --virtual .build-deps \
        gcc \
        libxml2-dev \
        libxslt-dev \
        musl-dev \
    && C_INCLUDE_PATH=/usr/include/libxml2 pip install --no-cache-dir \
        lxml \
        requests \
    && apk del .build-deps

# Copy the script already marked executable; avoids a separate chmod layer.
COPY --chmod=0755 cia.py /usr/local/bin/

# The script writes to ./cia relative to the working directory, and the
# documented `docker run` command bind-mounts the host results directory onto
# /root/cia, so the working directory stays /root.
WORKDIR /root

CMD ["cia.py"]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment