Skip to content

Instantly share code, notes, and snippets.

@alexbrasetvik
Created November 13, 2013 12:20
Show Gist options
  • Save alexbrasetvik/7448181 to your computer and use it in GitHub Desktop.
Save alexbrasetvik/7448181 to your computer and use it in GitHub Desktop.
Using an edgeNGram index analyzer and a simpler search analyzer to support partial filename matching.
# Analysis settings for the partial filename search example.
# Switch to the Analysis-tab to see how these samples get analyzed.
text:
  - My_second_file_created_at_2012.01.13.pdf
  - Another file.txt
analyzer:
  # Index-time analyzer: tokenize the filename, lowercase each token, then
  # expand it with the edge_ngram filter defined below.
  filename_index:
    type: custom
    tokenizer: filename
    filter:
      - lowercase
      - edge_ngram
  # Search-time analyzer: same tokenizer and lowercasing, but no n-gram
  # expansion of the search terms.
  filename_search:
    type: custom
    tokenizer: filename
    filter:
      - lowercase
tokenizer:
  # Pattern tokenizer; the regex matches runs of characters that are neither
  # Unicode letters (\p{L}) nor digits (\d).
  filename:
    type: pattern
    pattern: "[^\\p{L}\\d]+"
filter:
  # Front-anchored n-grams between 1 and 20 characters long.
  edge_ngram:
    type: edge_ngram
    side: front
    min_gram: 1
    max_gram: 20
# Sample documents for the example: one YAML document per file, each giving
# the document type and the filename to be indexed.
_type: "file"
filename: "My_first_file_created_at_2012.01.13.doc"
---
_type: "file"
filename: "My_second_file_created_at_2012.02.01.pdf"
---
_type: "file"
filename: "Another file.txt"
---
_type: "file"
filename: "And_again_another_file.docx"
---
_type: "file"
filename: "foo.bar.txt"
# Mapping for the "file" type: the filename field is analyzed with
# filename_index when indexing and with filename_search when searching.
file:
  properties:
    filename:
      type: string
      index_analyzer: filename_index
      search_analyzer: filename_search
#!/bin/bash
#
# Partial filename search demo: creates the "play" index with an edge n-gram
# index analyzer and a plain search analyzer, bulk-indexes five sample
# filenames, then runs two example searches against that index.
#
# Environment:
#   ELASTICSEARCH_ENDPOINT  Base URL of the cluster
#                           (default: http://localhost:9200).

# Stop at the first failing command so that documents are not indexed (and
# searches are not run) when an earlier curl call could not be executed.
set -euo pipefail

# Honour a caller-supplied endpoint; fall back to a local node otherwise.
export ELASTICSEARCH_ENDPOINT="${ELASTICSEARCH_ENDPOINT:-http://localhost:9200}"

# Create indexes
# NOTE(review): the "text" array under "analysis" looks like sample input for
# the Play tool's Analysis tab rather than an Elasticsearch setting -- confirm
# before removing it.
curl -XPUT "$ELASTICSEARCH_ENDPOINT/play" -d '{
  "settings": {
    "analysis": {
      "text": [
        "My_second_file_created_at_2012.01.13.pdf",
        "Another file.txt"
      ],
      "analyzer": {
        "filename_index": {
          "type": "custom",
          "tokenizer": "filename",
          "filter": [
            "lowercase",
            "edge_ngram"
          ]
        },
        "filename_search": {
          "type": "custom",
          "tokenizer": "filename",
          "filter": [
            "lowercase"
          ]
        }
      },
      "tokenizer": {
        "filename": {
          "type": "pattern",
          "pattern": "[^\\p{L}\\d]+"
        }
      },
      "filter": {
        "edge_ngram": {
          "type": "edge_ngram",
          "side": "front",
          "min_gram": 1,
          "max_gram": 20
        }
      }
    }
  },
  "mappings": {
    "file": {
      "properties": {
        "filename": {
          "type": "string",
          "index_analyzer": "filename_index",
          "search_analyzer": "filename_search"
        }
      }
    }
  }
}'

# Index documents
# refresh=true is passed so the documents are searchable by the queries below.
curl -XPOST "$ELASTICSEARCH_ENDPOINT/_bulk?refresh=true" -d '
{"index":{"_index":"play","_type":"file"}}
{"filename":"My_first_file_created_at_2012.01.13.doc"}
{"index":{"_index":"play","_type":"file"}}
{"filename":"My_second_file_created_at_2012.02.01.pdf"}
{"index":{"_index":"play","_type":"file"}}
{"filename":"Another file.txt"}
{"index":{"_index":"play","_type":"file"}}
{"filename":"And_again_another_file.docx"}
{"index":{"_index":"play","_type":"file"}}
{"filename":"foo.bar.txt"}
'

# Do searches
# Both searches are scoped to the "play" index created above instead of
# querying every index on the cluster.

# Prefix search: "fi" as a partial filename term.
curl -XPOST "$ELASTICSEARCH_ENDPOINT/play/_search?pretty" -d '
{
  "query": {
    "match": {
      "filename": {
        "query": "fi"
      }
    }
  }
}
'

# Date fragment search. The term is sent as the JSON string "2012.01" rather
# than a bare number, so the exact text reaches the analyzer without relying
# on number-to-string conversion. The match clause finds documents containing
# either token; the match_phrase clause favours documents containing the
# tokens in sequence.
curl -XPOST "$ELASTICSEARCH_ENDPOINT/play/_search?pretty" -d '
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "filename": {
              "query": "2012.01"
            }
          }
        },
        {
          "match_phrase": {
            "filename": {
              "query": "2012.01"
            }
          }
        }
      ]
    }
  }
}
'
# Auto generated by Found's Play-tool at 2013-11-13T13:20:26+01:00
# Play metadata: format version plus the title and description of the example.
version: 0
title: "Partial filename search"
description: "Using an edgeNGram index analyzer and a simpler search analyzer for partial filename purposes."
# Example to match partial filenames, without being fuzzy.
# Press Ctrl+Enter to execute when you have made changes.
# Click "Help" in the upper right corner for more information.
# This example is due to Clinton Gormley:
# http://stackoverflow.com/questions/9421358/filename-search-with-elasticsearch/9432450#9432450
query:
  match:
    filename:
      query: fi
---
query:
  bool:
    should:
      # The search term is quoted so YAML keeps it as the string "2012.01"
      # instead of resolving it to a floating-point number.
      # This will match both "2012.01.13" and "2012.02.01", which is okay.
      - match:
          filename:
            query: "2012.01"
      # ... but "2012.01.13" is a better match:
      - match_phrase:
          filename:
            query: "2012.01"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment