Last active
March 7, 2024 11:35
-
-
Save mcarletti/3d9b6435566083deb4a2aa644920ab07 to your computer and use it in GitHub Desktop.
Parse main CVF conference titles and abstracts (eg, CVPR, ICCV)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build a word cloud from a plain-text corpus (e.g. concatenated CVF paper
# titles or abstracts) given as the single command-line argument.
if len(sys.argv) < 2:
    sys.exit(f"usage: python {sys.argv[0]} <textfile>")

filename = sys.argv[1]
print(filename)

# Context manager guarantees the handle is closed; explicit encoding avoids
# platform-dependent default decoding of the corpus file.
with open(filename, "r", encoding="utf-8") as fp:
    text = fp.read()

# Words excluded from the cloud: citation boilerplate ("et al."), common
# English stopwords, and generic paper vocabulary ("method", "proposed", ...)
# that would otherwise dominate any conference corpus.
blacklist = [
    "et", "al", "al.", "fig", "figure", "section", "using", "used", "show", "results", "method", "proposed", "paper", "for", "and", "by", "in", "the",
    "we", "our", "this", "that", "to", "of", "a", "an", "is", "are", "on", "with", "from", "as", "at", "it", "be", "can", "which", "has", "have", "been",
    "learning", "via", "based", "toward", "towards", "over", "under", "above", "below", "between", "among", "within", "without", "across", "along",
]

wordcloud = WordCloud(
    max_font_size=120, width=1280, height=920, stopwords=blacklist
).generate_from_text(text)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse

import numpy as np
import requests
from bs4 import BeautifulSoup
from tika import parser
from tqdm import tqdm
def get_parsed_html(url):
    """Fetch *url* over HTTP and return its content as a BeautifulSoup tree."""
    response = requests.get(url)
    return BeautifulSoup(response.content, features="lxml")
def extract_abstract(url):
    """Return the abstract text of a CVF paper page, or "" when missing.

    The open-access pages keep the abstract inside <div id="abstract">; the
    first and last characters of its text are dropped (page-markup padding).
    """
    page = get_parsed_html(url)
    abstract_div = page.body.find("div", attrs={"id": "abstract"})
    return "" if abstract_div is None else abstract_div.text[1:-1]
def extract_abstract_from_pdf(filename):
    """Extract the abstract from a paper PDF parsed with Apache Tika.

    Assumes the extracted text contains an "Abstract" marker followed by the
    abstract body, terminated by the "1. Introduction" heading — TODO confirm
    this holds for all CVF paper layouts.
    """
    document = parser.from_file(filename)
    full_text = document["content"]
    start = full_text.find("Abstract") + len("Abstract")
    end = full_text.find("1. Introduction")
    return full_text[start:end]
if __name__ == "__main__":
    # Scrape paper titles (and optionally abstracts) from the CVF
    # open-access site and save them as plain-text files.
    #
    # Named `arg_parser` (not `parser`) so the module-level `tika.parser`
    # used by extract_abstract_from_pdf is not shadowed. NOTE: `argparse`
    # was never imported in the original script — a NameError at runtime;
    # the import block above now provides it.
    arg_parser = argparse.ArgumentParser(
        description="Scrape CVF open-access paper titles and abstracts."
    )
    arg_parser.add_argument("--get_abstracts", action="store_true")
    arg_parser.add_argument("--conference_names", nargs="+", type=str, default=[])
    args = arg_parser.parse_args()

    titles = []
    abstracts = []

    root = "http://openaccess.thecvf.com/"

    # The menu page lists every conference edition as a <dd> entry.
    html = get_parsed_html(root + "menu.py")
    editions = html.body.findAll("dd")

    for edition in editions:
        # Entry text looks like " CVPR 2023 [...]": drop the leading space
        # and everything from the bracketed link section onward.
        conference_name = edition.text[1:edition.text.find("[") - 1]

        # When a --conference_names filter is given, keep only editions whose
        # short name (e.g. "CVPR") was requested.
        if args.conference_names and conference_name.split(" ")[0] not in args.conference_names:
            print("Skipping:", conference_name)
            continue
        print("Processing:", conference_name)

        conference_url = edition.find("a")["href"]
        # "?day=all" collapses multi-day programmes into one listing page.
        html = get_parsed_html(root + conference_url + "?day=all")
        paper_entries = html.body.findAll("dt", attrs={"class": "ptitle"})

        for paper in tqdm(paper_entries):
            titles.append(paper.text)
            if args.get_abstracts:
                paper_url = paper.find("a")["href"]
                abstracts.append(extract_abstract(root + paper_url))

    np.savetxt("cvf_titles.txt", titles, "%s")
    if args.get_abstracts:
        np.savetxt("cvf_abstracts.txt", abstracts, "%s")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment