Last active
January 15, 2020 07:07
-
-
Save fnurl/586dbdb7d313f1911580ae873d5ad213 to your computer and use it in GitHub Desktop.
A script that produces a JSON page index file for Markdown files (extension `.md`) in a directory and its subdirectories (e.g. the `content` directory of a Hugo site, https://gohugo.io/) for use with Algolia DocSearch (https://github.com/algolia/docsearch).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import yaml | |
import json | |
# Base URL that is prepended to the URL of every indexed page.
base_url = "http://localhost:1313"

# Attribute mapping for docsearch: each key is a docsearch property and
# each value names the file data key(s) it is filled from.
#
# A list value (as used for `tags`) means the values of several
# taxonomies are combined into one list for that docsearch property.
docsearch_mapping = dict(
    content="content",
    url="url",
    tags=["tags", "categories"],
)

# Default values for the `weight` property of every index entry.
docsearch_weight = dict(position=1, level=10, page_rank=0)

# Name of the top hierarchy level. Sections will be used for the
# additional levels.
base_level = "Hugo Site"
def parse_md(filepath):
    """Parse a markdown file into a dict of its front matter plus content.

    YAML front matter is recognised only when the very first line of the
    file is a ``---`` delimiter; it ends at the next ``---`` line. All
    other lines form the markdown body. Later ``---`` lines (markdown
    horizontal rules) are kept as part of the body instead of re-opening
    the front matter.

    Args:
        filepath: Path of the markdown file to read (decoded as UTF-8).

    Returns:
        A dict holding the front matter properties and, unless the front
        matter itself defines a ``content`` property, the markdown body
        under the ``"content"`` key. A file without front matter yields a
        dict that only contains ``"content"``.
    """
    delimiter = "---"
    yaml_lines = []
    content_lines = []
    in_yaml = False

    with open(filepath, encoding="utf-8") as datafile:
        for line_number, line in enumerate(datafile):
            is_delimiter = line.rstrip() == delimiter
            if line_number == 0 and is_delimiter:
                # front matter must start on the first line of the file
                in_yaml = True
            elif in_yaml and is_delimiter:
                # closing delimiter; everything after it is content
                in_yaml = False
            elif in_yaml:
                yaml_lines.append(line)
            else:
                content_lines.append(line)

    md_data = {}
    if yaml_lines:
        # safe_load only builds plain data types, which is all front matter
        # needs, and unlike yaml.load() it works without a Loader argument.
        # Front matter that holds nothing but comments loads as None.
        md_data = yaml.safe_load("".join(yaml_lines)) or {}

    if "content" not in md_data:
        md_data["content"] = "".join(content_lines)
    else:
        # the front matter already uses the key the body would be stored in
        sys.stderr.write("ERROR: Could not store content for '" + filepath + "'\n")
    return md_data
def create_index_list(walk_dir):
    """Create a list of index entries starting from the directory walk_dir.

    Every file below walk_dir whose name ends in ``.md`` becomes one entry
    (a dict) with these properties: ``objectID``, ``url``, the properties
    configured in ``docsearch_mapping``, ``hierarchy``,
    ``hierarchy_complete``, ``hierarchy_radio``, ``type``, ``anchor`` and
    ``weight``.

    Args:
        walk_dir: Path of the directory that is searched recursively.

    Returns:
        A list with one dict per markdown file. Directories and files are
        visited in sorted order so that objectIDs are reproducible.
    """
    # used to store all indexed items (markdown files)
    index_list = []

    # give items an objectID so that the index file can be uploaded
    # to algolia again, overwriting previous index items
    objectID = 0

    for root, subdirs, files in os.walk(walk_dir):
        # os.walk lists entries in arbitrary order; sorting subdirs in place
        # also fixes the order in which the walk descends into them
        subdirs.sort()
        for filename in sorted(files):
            # index md files
            # NOTE(review): Hugo section pages named `_index.md` are indexed
            # like regular pages here - confirm whether they should get the
            # same URL treatment as `index.md`.
            if not filename.endswith(".md"):
                continue

            objectID += 1
            filepath = os.path.join(root, filename)
            subpaths = _page_path_components(walk_dir, root, filename)

            # set up list for the hierarchy of the markdown file
            hierarchy_list = [base_level] + subpaths

            # construct the url of the markdown file; every component gets
            # a trailing slash, so a top-level index.md maps to base_url + "/"
            url = base_url + "/" + "".join(part + "/" for part in subpaths)
            sys.stderr.write("Indexing '" + filepath + "' (" + url + ")\n")

            # get data from the file (frontmatter and content)
            filedata = parse_md(filepath)

            # create index entry and map filedata to docsearch structure;
            # a mapped `url` frontmatter property replaces the generated url
            indexed_item = {'objectID': objectID, 'url': url}
            indexed_item.update(_map_filedata(filedata))

            # hierarchy and hierarchy_complete properties. Each
            # hierarchy_complete level is the path down to and including
            # that level, e.g. lvl1 is "<lvl0> > <lvl1>".
            hierarchy = create_empty_hierarchy()
            hierarchy_complete = create_empty_hierarchy()
            for level, name in enumerate(hierarchy_list[:len(hierarchy)]):
                level_key = "lvl" + str(level)
                hierarchy[level_key] = name
                hierarchy_complete[level_key] = " > ".join(hierarchy_list[:level + 1])
            indexed_item["hierarchy"] = hierarchy
            indexed_item["hierarchy_complete"] = hierarchy_complete

            # hierarchy_radio and type: only the deepest level that is set
            # in `hierarchy` is filled in, and `type` names that level
            hierarchy_radio = create_empty_hierarchy()
            deepest = min(len(hierarchy_list), len(hierarchy_radio)) - 1
            deepest_key = "lvl" + str(deepest)
            hierarchy_radio[deepest_key] = hierarchy[deepest_key]
            indexed_item["hierarchy_radio"] = hierarchy_radio
            indexed_item["type"] = deepest_key

            # anchor and weight. anchors are not considered. The weight is
            # copied so that entries do not share one mutable dict.
            indexed_item["anchor"] = None
            indexed_item["weight"] = dict(docsearch_weight)

            index_list.append(indexed_item)

    sys.stderr.write("Done indexing .md files in '" + walk_dir + "'" + "\n")
    return index_list


def _page_path_components(walk_dir, root, filename):
    """Return the path components of a markdown page relative to walk_dir."""
    relative_dir = os.path.relpath(root, walk_dir)
    # relpath reports walk_dir itself as "."; such files have no section
    if relative_dir == os.curdir:
        components = []
    else:
        components = relative_dir.split(os.sep)
    # index.md have special URLs: they are served at their directory's URL
    if filename != "index.md":
        components.append(filename[:-3])
    return components


def _map_filedata(filedata):
    """Translate parsed file data into the properties set in docsearch_mapping."""
    mapped = {}
    for docsearch_key, filedata_key in docsearch_mapping.items():
        # plain mappings, configured at the top of the script
        if isinstance(filedata_key, str):
            if filedata_key in filedata:
                mapped[docsearch_key] = filedata[filedata_key]
        # list mappings: combine the values of all listed frontmatter
        # properties into one list (used for the "tags" property)
        elif isinstance(filedata_key, list):
            aggregated = []
            for filedata_subkey in filedata_key:
                value = filedata.get(filedata_subkey)
                if value is None:
                    # property missing or left empty in the frontmatter
                    continue
                if isinstance(value, (list, tuple)):
                    aggregated.extend(value)
                else:
                    # a single value, e.g. `tags: foo`; keep it whole
                    # instead of splitting a string into characters
                    aggregated.append(value)
            mapped[docsearch_key] = aggregated
    return mapped
def create_empty_hierarchy():
    """Return a new dict that maps the keys "lvl0" to "lvl6" to None."""
    level_names = ("lvl" + str(level_index) for level_index in range(7))
    # fromkeys without a value argument fills every key with None
    return dict.fromkeys(level_names)
if __name__ == '__main__':
    # exactly one argument is expected: the directory to be indexed
    if len(sys.argv) != 2:
        sys.stderr.write("ERROR: Please supply a path to the dir to be indexed.\n")
        sys.exit(1)

    walk_dir = sys.argv[1]

    # os.walk() yields nothing for a path that is not a directory, which
    # would write an empty index and exit successfully; report it instead
    if not os.path.isdir(walk_dir):
        sys.stderr.write("ERROR: '" + walk_dir + "' is not a directory.\n")
        sys.exit(1)

    # gather index data
    index_list = create_index_list(walk_dir)

    # output the index as readable json to stdout. does not escape UTF-8
    # characters. The output ends with a newline.
    sys.stdout.write(json.dumps(index_list, ensure_ascii=False, indent=2) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you use Hugo, you can set up your site by editing your `config.yaml` to contain your Algolia App ID, API key, and the index you want to use, as follows:
Then use the following HTML in your template:
docsearch script and css:
snippet for the search box: