Tokenizer demo
standard
- The standard tokenizer provides grammar-based tokenization (based on the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29) and works well for most languages.
PUT standard_tokenizer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "standard",
"max_token_length": 5
}
}
}
}
}
POST standard_tokenizer_index/_analyze
{
"analyzer": "my_analyzer",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
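With max_token_length set to 5, tokens longer than five characters are split at that limit; the request above should return terms roughly like:
# [ The, 2, QUICK, Brown, Foxes, jumpe, d, over, the, lazy, dog's, bone ]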
letter
- The letter tokenizer breaks text into terms whenever it encounters a character which is not a letter.
POST _analyze
{
"tokenizer": "letter",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
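The letter tokenizer drops digits and punctuation entirely, so the request should produce terms along these lines:
# [ The, QUICK, Brown, Foxes, jumped, over, the, lazy, dog, s, bone ]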
lowercase
- The lowercase tokenizer, like the letter tokenizer, breaks text into terms whenever it encounters a character which is not a letter, but it also lowercases all terms.
POST _analyze
{
"tokenizer": "lowercase",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
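The output matches the letter tokenizer except that every term is lowercased; expected terms are roughly:
# [ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]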
whitespace
- The whitespace tokenizer breaks text into terms whenever it encounters a whitespace character.
POST _analyze
{
"tokenizer": "whitespace",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
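Splitting only on whitespace keeps punctuation and hyphens inside the terms, so the request should return roughly:
# [ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ]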
uax_url_email
- The uax_url_email tokenizer is like the standard tokenizer except that it recognises URLs and email addresses as single tokens.
POST _analyze
{
"tokenizer": "uax_url_email",
"text": "my home page is www.zhengcj01.com and the email is zhengcj01@test.com"
}
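Unlike the standard tokenizer, the URL and the email address should each come back as a single token, roughly:
# [ my, home, page, is, www.zhengcj01.com, and, the, email, is, zhengcj01@test.com ]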
DELETE uax_url_email_tokenizer_index
PUT uax_url_email_tokenizer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "uax_url_email",
"max_token_length":50
}
}
}
}
}
POST uax_url_email_tokenizer_index/_analyze
{
"analyzer": "my_analyzer",
"text": "my home page is www.zhengcj01.com and the email is zhengcj01@test.com"
}
classic
- The classic tokenizer is a grammar-based tokenizer that is good for English language documents. It splits words at most punctuation characters, removing punctuation. However, a dot that's not followed by whitespace is considered part of a token.
- It splits words at hyphens, unless there’s a number in the token, in which case the whole token is interpreted as a product number and is not split.
- It recognizes email addresses and internet hostnames as one token.
POST _analyze
{
"tokenizer": "classic",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
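With the default settings the classic tokenizer should return terms roughly like:
# [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]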
PUT classic_tokenizer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "classic",
"max_token_length": 5
}
}
}
}
}
POST classic_tokenizer_index/_analyze
{
"analyzer": "my_analyzer",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
ngram
- The ngram tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits N-grams of each word of the specified length.
POST _analyze
{
"tokenizer": "ngram",
"text": "Quick Fox"
}
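With the defaults (min_gram 1, max_gram 2, no token_chars restriction) the ngram tokenizer even emits grams containing spaces; expected terms are roughly:
# [ Q, Qu, u, ui, i, ic, c, ck, k, "k ", " ", " F", F, Fo, o, ox, x ]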
PUT ngram_tokenizer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
}
}
}
}
POST ngram_tokenizer_index/_analyze
{
"analyzer": "my_analyzer",
"text": "2 Quick Foxes."
}
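Restricted to letters and digits with 3-character grams, the request should return roughly:
# [ Qui, uic, ick, Fox, oxe, xes ]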
edge_ngram
- The edge_ngram tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits N-grams of each word where the start of the N-gram is anchored to the beginning of the word.
POST _analyze
{
"tokenizer": "edge_ngram",
"text": "Quick Fox"
}
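With the defaults (min_gram 1, max_gram 2) only the very start of the text is covered, so the request should return roughly:
# [ Q, Qu ]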
PUT edge_ngram_tokenizer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 10,
"token_chars": [
"letter",
"digit"
]
}
}
}
}
}
POST edge_ngram_tokenizer_index/_analyze
{
"analyzer": "my_analyzer",
"text": "2 Quick Foxes."
}
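Anchored to the start of each word and limited to letters and digits, the request should return roughly:
# [ Qu, Qui, Quic, Quick, Fo, Fox, Foxe, Foxes ]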
- A common use of the edge_ngram tokenizer is search-as-you-type: index with an edge_ngram analyzer, but search with a lighter analyzer so that query terms are not themselves split into n-grams.
PUT autocomplete_index
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
"lowercase"
]
},
"autocomplete_search": {
"tokenizer": "lowercase"
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 10,
"token_chars": [
"letter"
]
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
PUT autocomplete_index/_doc/1
{
"title": "Quick Foxes"
}
POST autocomplete_index/_refresh
GET autocomplete_index/_search
{
"query": {
"match": {
"title": {
"query": "Quick Fo",
"operator": "and"
}
}
}
}
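# The query text "Quick Fo" should analyze to [ quick, fo ] via autocomplete_search; both terms appear among the indexed edge n-grams of "Quick Foxes", so the match above should return document 1. The two _analyze calls below show the terms on each side.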
GET autocomplete_index/_analyze
{
"field": "title",
"text":"Quick Foxes"
}
GET autocomplete_index/_analyze
{
"field": "title",
"text":"Quick Fo"
}
keyword
- The keyword tokenizer is a "noop" tokenizer that accepts whatever text it is given and outputs the exact same text as a single term.
POST _analyze
{
"tokenizer": "keyword",
"text": "New York"
}
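The keyword tokenizer is mostly useful combined with token filters to normalise structured input. As a minimal sketch (using only the built-in lowercase filter), the whole value should come back as one lowercased term:
POST _analyze
{
"tokenizer": "keyword",
"filter": ["lowercase"],
"text": "New York"
}
# expected: a single term, roughly [ new york ]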
pattern
- The pattern tokenizer uses a regular expression to either split text into terms whenever it matches a word separator, or to capture matching text as terms. The default pattern is \W+, which splits text whenever it encounters non-word characters.
POST _analyze
{
"tokenizer": "pattern",
"text": "The foo_bar_size's default is 5."
}
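With the default pattern \W+ the possessive 's is split off, so the request should return roughly:
# [ The, foo_bar_size, s, default, is, 5 ]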
PUT pattern_tokenizer_index_v1
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "pattern",
"pattern": ","
}
}
}
}
}
POST pattern_tokenizer_index_v1/_analyze
{
"analyzer": "my_analyzer",
"text": "comma,separated,values"
}
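Splitting on the comma pattern should return roughly:
# [ comma, separated, values ]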
# "((?:\\"|[^"]|\\")*)": In the next example, we configure the pattern tokenizer to capture values enclosed in double quotes (ignoring embedded escaped quotes \").
PUT pattern_tokenizer_index_v2
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "pattern",
"pattern": "\"((?:\\\\\"|[^\"]|\\\\\")+)\"",
"group": 1
}
}
}
}
}
POST pattern_tokenizer_index_v2/_analyze
{
"analyzer": "my_analyzer",
"text": "\"value\", \"value with embedded \\\" quote\""
}
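With group 1 as the token, only the quoted values are emitted; the request should return roughly:
# [ value, value with embedded \" quote ]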
char_group
- The char_group tokenizer breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful where simple custom tokenization is desired and the overhead of the pattern tokenizer is not acceptable.
POST _analyze
{
"tokenizer": {
"type": "char_group",
"tokenize_on_chars": [
"whitespace",
"-",
"\n"
]
},
"text": "The QUICK brown-fox"
}
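Splitting on whitespace, hyphens and newlines should return roughly:
# [ The, QUICK, brown, fox ]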
simple_pattern
- The simple_pattern tokenizer uses a regular expression to capture matching text as terms. It supports a more limited set of regular expression features than the pattern tokenizer, but tokenization is generally faster.
PUT simple_pattern_tokenizer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "simple_pattern",
"pattern": "[0123456789]{3}"
}
}
}
}
}
POST simple_pattern_tokenizer_index/_analyze
{
"analyzer": "my_analyzer",
"text": "fd-786-335-514-12-x"
}
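Only the text that matches the pattern becomes a term, so the request should return roughly:
# [ 786, 335, 514 ]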
simple_pattern_split
- The simple_pattern_split tokenizer uses the same restricted regular expression subset as the simple_pattern tokenizer, but splits the input at matches of the pattern rather than returning the matches as terms.
PUT simple_pattern_split_tokenizer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "simple_pattern_split",
"pattern": "_"
}
}
}
}
}
POST simple_pattern_split_tokenizer_index/_analyze
{
"analyzer": "my_analyzer",
"text": "an_underscored_phrase"
}
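Here the matches themselves are discarded and the text between them becomes the terms, roughly:
# [ an, underscored, phrase ]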
DELETE simple_pattern_split_tokenizer_index_v2
PUT simple_pattern_split_tokenizer_index_v2
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "simple_pattern_split",
"pattern": "[0123456789]{3}|-"
}
}
}
}
}
POST simple_pattern_split_tokenizer_index_v2/_analyze
{
"analyzer": "my_analyzer",
"text": "fd-786-335-514-12-x"
}
path_hierarchy
- The path_hierarchy tokenizer takes a hierarchical value like a filesystem path, splits on the path separator, and emits a term for each component in the tree.
POST _analyze
{
"tokenizer": "path_hierarchy",
"text": "/one/two/three"
}
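The path_hierarchy tokenizer emits one term per level of the hierarchy, so the request should return roughly:
# [ /one, /one/two, /one/two/three ]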
DELETE path_hierarchy_tokenizer_index
PUT path_hierarchy_tokenizer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "path_hierarchy",
"delimiter": "-",
"replacement": "/",
"skip": 2,
"reverse":true
}
}
}
}
}
POST path_hierarchy_tokenizer_index/_analyze
{
"analyzer": "my_analyzer",
"text": "one-two-three-four-five"
}
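With the delimiter "-" replaced by "/", skip set to 2 and reverse enabled, the reference docs describe output along these lines:
# [ one/two/three/, two/three/, three/ ]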
PUT file-path-test
{
"settings": {
"analysis": {
"analyzer": {
"custom_path_tree": {
"tokenizer": "custom_hierarchy"
},
"custom_path_tree_reversed": {
"tokenizer": "custom_hierarchy_reversed"
}
},
"tokenizer": {
"custom_hierarchy": {
"type": "path_hierarchy",
"delimiter": "/"
},
"custom_hierarchy_reversed": {
"type": "path_hierarchy",
"delimiter": "/",
"reverse": "true"
}
}
}
},
"mappings": {
"properties": {
"file_path": {
"type": "text",
"fields": {
"tree": {
"type": "text",
"analyzer": "custom_path_tree"
},
"tree_reversed": {
"type": "text",
"analyzer": "custom_path_tree_reversed"
}
}
}
}
}
}
POST file-path-test/_bulk
{"index":{"_id":1}}
{"file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg"}
{"index":{"_id":2}}
{"file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg"}
{"index":{"_id":3}}
{"file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg"}
{"index":{"_id":4}}
{"file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg"}
{"index":{"_id":5}}
{"file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg"}
GET file-path-test/_search
{
"query": {
"match": {
"file_path": "/User/bob/photos/2017/05"
}
}
}
GET file-path-test/_search
{
"query": {
"term": {
"file_path.tree": "/User/alice/photos/2017/05/16"
}
}
}
GET file-path-test/_search
{
"query": {
"term": {
"file_path.tree_reversed": {
"value": "my_photo1.jpg"
}
}
}
}
POST file-path-test/_analyze
{
"analyzer": "custom_path_tree",
"text": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}
POST file-path-test/_analyze
{
"analyzer": "custom_path_tree_reversed",
"text": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}
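Comparing the two _analyze responses above shows why the earlier queries work: the forward tree emits prefix paths such as /User/alice, while the reversed tree emits suffix paths down to the bare file name.
# file_path.tree_reversed should include a token that is just: my_photo1.jpg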
GET file-path-test/_search
{
"query": {
"bool" : {
"must" : {
"match" : { "file_path" : "16" }
},
"filter": {
"term" : { "file_path.tree" : "/User/alice" }
}
}
}
}