Created
March 6, 2024 16:27
-
-
Save j40903272/25af37c1869975584aa8cf65db704081 to your computer and use it in GitHub Desktop.
Wikipedia entity search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from collections import Counter
from typing import List

import numpy as np
import requests
#from googlesearch import search as google_search
import wikipedia as wikipedia_api
# Lower-cased title prefixes of non-article namespaces. A page whose
# lower-cased title starts with one of these is skipped by the Wikipedia
# helpers in this file. Kept as a tuple so it can be tested prefix-by-prefix.
DEFAULT_IGNORED_NS = (
    'wikipedia:',
    'file:',
    'portal:',
    'template:',
    'mediawiki:',
    'user:',
    'help:',
    'book:',
    'draft:',
    'module:',
    'timedtext:',
)
def _normalize_title(title): | |
output = "" | |
if len(title) > 0: | |
output += title[0].upper() | |
if len(title) > 1: | |
output += title[1:].replace('_', ' ') | |
return output | |
# https://developers.google.com/knowledge-graph | |
def google_entity_search(query, **kwargs):
    """Query the Google Knowledge Graph Search API and print each match.

    Args:
        query: Free-text entity query, sent as the ``query`` parameter.
        **kwargs: Extra request parameters. They are merged over the
            defaults (``query``, ``limit``, ``indent``, ``key``), so a
            colliding name overrides the default.

    Returns:
        None. For every element of ``itemListElement`` the result's name,
        description, detailed description and ``@type`` are printed,
        followed by the element's ``resultScore``. A field the API did not
        include is printed as ``None``.

    Raises:
        KeyError: if the JSON payload has no ``itemListElement`` key
            (presumably an API error response -- verify against the API).
    """
    # NOTE(review): an API key is committed in source. The environment
    # variable takes precedence so the embedded key can be rotated or removed
    # without touching callers; the literal stays only as a fallback.
    key = os.environ.get("GOOGLE_API_KEY", "AIzaSyCLpGT1fyIPqVgkbSxN8X75Pd5bDduxHmA")
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
        'query': query,
        'limit': 1,
        'indent': True,
        'key': key,
    }
    params.update(kwargs)

    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(service_url, params=params, timeout=10).json()

    for element in response['itemListElement']:
        result = element.get('result', {})
        # Use .get() so a result lacking one of these fields does not abort
        # the listing halfway through with a KeyError.
        for field in ('name', 'description', 'detailedDescription', '@type'):
            print(result.get(field))
        print(element.get('resultScore'))
def google_search(query, **kwargs):
    """Return result links for *query* from Google Custom Search.

    Two search engines are tried in order: first the one the original
    author labelled "restricted to wikipedia only", then the one labelled
    "search all". The links of the first response that contains ``items``
    are returned.

    Args:
        query: Search string, sent as the ``q`` parameter.
        **kwargs: Extra request parameters merged over the defaults
            (``q``, ``key``, ``lr``). ``cx`` is always set by this function
            and cannot be overridden this way.

    Returns:
        A list of ``link`` values, or an empty list when neither engine
        returned any ``items``.
    """
    # NOTE(review): an API key is committed in source. The environment
    # variable takes precedence so the embedded key can be rotated or removed
    # without touching callers; the literal stays only as a fallback.
    key = os.environ.get("GOOGLE_API_KEY", "AIzaSyCLpGT1fyIPqVgkbSxN8X75Pd5bDduxHmA")
    params = {
        'q': query,
        'key': key,
        'lr': "lang_en",
    }
    params.update(kwargs)

    # (engine id, endpoint) pairs, in fallback order.
    engines = (
        # restricted to wikipedia only
        ("009719884197649911529:2nyd4xipros",
         "https://www.googleapis.com/customsearch/v1/siterestrict"),
        # search all
        ("009719884197649911529:f5yfirrdx7o",
         "https://www.googleapis.com/customsearch/v1"),
    )
    for cx, service_url in engines:
        params['cx'] = cx
        # Without a timeout requests can block forever on a stalled connection.
        response = requests.get(service_url, params=params, timeout=10).json()
        if 'items' in response:
            return [item['link'] for item in response['items']]
    return []
def wikipedia_search(query, **kwargs):
    """Count the normalized titles of the top Wikipedia hits for *query*.

    The first three search results are resolved to pages; pages whose
    lower-cased title starts with a prefix in ``DEFAULT_IGNORED_NS`` are
    skipped, as are candidates that resolve to a disambiguation page or to
    no page at all.

    Args:
        query: Search string passed to the Wikipedia search API.
        **kwargs: Accepted for signature compatibility; currently unused.

    Returns:
        A ``collections.Counter`` mapping normalized page title to the
        number of candidates that resolved to it.
    """
    wiki_results = Counter()
    # The original referenced an undefined name `keyword` here, which made
    # every call fail with NameError; the search term is the `query` argument.
    for candidate in wikipedia_api.search(query, results=3):
        try:
            page = wikipedia_api.page(candidate, auto_suggest=False)
        except (wikipedia_api.exceptions.DisambiguationError,
                wikipedia_api.exceptions.PageError):
            # One unresolvable candidate should not discard the other hits.
            continue
        # str.startswith accepts a tuple of prefixes, so one call covers all.
        if page.title.lower().startswith(DEFAULT_IGNORED_NS):
            continue
        wiki_results[_normalize_title(page.title)] += 1
    return wiki_results
def bfs(seed: List[str], breadth: int = 3, topk: int = 3):
    """Expand *seed* keywords breadth-first through Wikipedia page links.

    In each round every keyword in the frontier is searched on Wikipedia,
    the top *topk* hits are resolved to pages, and all links of those pages
    become candidates for the next round. Pages whose lower-cased title
    starts with a prefix in ``DEFAULT_IGNORED_NS`` are skipped, as are
    candidates that resolve to a disambiguation page or to no page at all.

    Args:
        seed: Starting keywords.
        breadth: Maximum number of expansion rounds.
        topk: Number of search results to follow per keyword.

    Returns:
        A set containing the seed keywords and every link title collected.
    """
    queue = seed
    entities = set(seed)
    for _ in range(breadth):
        next_queue = set()
        for keyword in queue:
            # The original iterated an undefined name `search` instead of
            # the result it had just fetched, raising NameError.
            for candidate in wikipedia_api.search(keyword, results=topk):
                try:
                    page = wikipedia_api.page(candidate, auto_suggest=False)
                except (wikipedia_api.exceptions.DisambiguationError,
                        wikipedia_api.exceptions.PageError):
                    # One unresolvable candidate should not abort the crawl.
                    continue
                # str.startswith accepts a tuple of prefixes.
                if page.title.lower().startswith(DEFAULT_IGNORED_NS):
                    continue
                next_queue.update(page.links)
        # Only expand entities not seen before: anything already in
        # `entities` was expanded in an earlier round, so searching it again
        # would only repeat network calls without adding new links.
        queue = next_queue - entities
        entities |= next_queue
        if not queue:
            break
    return entities
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment