Created
September 24, 2018 13:49
-
-
Save mingodad/7fdec8eebdde70ee388db60855760c72 to your computer and use it in GitHub Desktop.
Implementation of the "compressed" column option (with the compress=/uncompress= table options) and the "min_word_size" unicode61 tokenizer option in fts5 for sqlite3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Index: ext/fts5/fts5Int.h | |
================================================================== | |
--- ext/fts5/fts5Int.h | |
+++ ext/fts5/fts5Int.h | |
@@ -160,10 +160,11 @@ | |
char *zDb; /* Database holding FTS index (e.g. "main") */ | |
char *zName; /* Name of FTS index */ | |
int nCol; /* Number of columns */ | |
char **azCol; /* Column names */ | |
u8 *abUnindexed; /* True for unindexed columns */ | |
+ u8 *abCompressed; /* True for compressed columns */ | |
int nPrefix; /* Number of prefix indexes */ | |
int *aPrefix; /* Sizes in bytes of nPrefix prefix indexes */ | |
int eContent; /* An FTS5_CONTENT value */ | |
char *zContent; /* content table */ | |
char *zContentRowid; /* "content_rowid=" option value */ | |
@@ -183,10 +184,14 @@ | |
char *zRank; /* Name of rank function */ | |
char *zRankArgs; /* Arguments to rank function */ | |
/* If non-NULL, points to sqlite3_vtab.base.zErrmsg. Often NULL. */ | |
char **pzErrmsg; | |
+ | |
+ /* Optional registered sqlite function for de/compression */ | |
+ char *zCompressFunc; | |
+ char *zUnCompressFunc; | |
#ifdef SQLITE_DEBUG | |
int bPrefixIndex; /* True to use prefix-indexes */ | |
#endif | |
}; | |
Index: ext/fts5/fts5_config.c | |
================================================================== | |
--- ext/fts5/fts5_config.c | |
+++ ext/fts5/fts5_config.c | |
@@ -383,10 +383,30 @@ | |
if( (rc = fts5ConfigSetEnum(aDetail, zArg, &pConfig->eDetail)) ){ | |
*pzErr = sqlite3_mprintf("malformed detail=... directive"); | |
} | |
return rc; | |
} | |
+ | |
+ if( sqlite3_strnicmp("compress", zCmd, nCmd)==0 ){ | |
+ if( pConfig->zCompressFunc ){ | |
+ *pzErr = sqlite3_mprintf("multiple compress=... directives"); | |
+ rc = SQLITE_ERROR; | |
+ }else{ | |
+ pConfig->zCompressFunc = sqlite3Fts5Strndup(&rc, zArg, -1); | |
+ } | |
+ return rc; | |
+ } | |
+ | |
+ if( sqlite3_strnicmp("uncompress", zCmd, nCmd)==0 ){ | |
+ if( pConfig->zUnCompressFunc ){ | |
+ *pzErr = sqlite3_mprintf("multiple uncompress=... directives"); | |
+ rc = SQLITE_ERROR; | |
+ }else{ | |
+ pConfig->zUnCompressFunc = sqlite3Fts5Strndup(&rc, zArg, -1); | |
+ } | |
+ return rc; | |
+ } | |
*pzErr = sqlite3_mprintf("unrecognized option: \"%.*s\"", nCmd, zCmd); | |
return SQLITE_ERROR; | |
} | |
@@ -470,10 +490,12 @@ | |
*pzErr = sqlite3_mprintf("reserved fts5 column name: %s", zCol); | |
rc = SQLITE_ERROR; | |
}else if( zArg ){ | |
if( 0==sqlite3_stricmp(zArg, "unindexed") ){ | |
p->abUnindexed[p->nCol] = 1; | |
+ }else if( 0==sqlite3_stricmp(zArg, "compressed") ){ | |
+ p->abCompressed[p->nCol] = 1; | |
}else{ | |
*pzErr = sqlite3_mprintf("unrecognized column option: %s", zArg); | |
rc = SQLITE_ERROR; | |
} | |
} | |
@@ -486,19 +508,21 @@ | |
** Populate the Fts5Config.zContentExprlist string. | |
*/ | |
static int fts5ConfigMakeExprlist(Fts5Config *p){ | |
int i; | |
int rc = SQLITE_OK; | |
+ const char *zFunc; | |
Fts5Buffer buf = {0, 0, 0}; | |
sqlite3Fts5BufferAppendPrintf(&rc, &buf, "T.%Q", p->zContentRowid); | |
if( p->eContent!=FTS5_CONTENT_NONE ){ | |
for(i=0; i<p->nCol; i++){ | |
+ zFunc = p->abCompressed[i] ? p->zUnCompressFunc : ""; | |
if( p->eContent==FTS5_CONTENT_EXTERNAL ){ | |
- sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", T.%Q", p->azCol[i]); | |
+ sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", %s(T.%Q)", zFunc, p->azCol[i]); | |
}else{ | |
- sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", T.c%d", i); | |
+ sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", %s(T.c%d)", zFunc, i); | |
} | |
} | |
} | |
assert( p->zContentExprlist==0 ); | |
@@ -535,13 +559,15 @@ | |
if( pRet==0 ) return SQLITE_NOMEM; | |
memset(pRet, 0, sizeof(Fts5Config)); | |
pRet->db = db; | |
pRet->iCookie = -1; | |
- nByte = nArg * (sizeof(char*) + sizeof(u8)); | |
+ nByte = nArg * (sizeof(char*) + (sizeof(u8)*2)); | |
pRet->azCol = (char**)sqlite3Fts5MallocZero(&rc, nByte); | |
pRet->abUnindexed = (u8*)&pRet->azCol[nArg]; | |
+ pRet->abCompressed = (u8*)&pRet->abUnindexed[nArg]; | |
+ pRet->zCompressFunc = pRet->zUnCompressFunc = NULL; | |
pRet->zDb = sqlite3Fts5Strndup(&rc, azArg[1], -1); | |
pRet->zName = sqlite3Fts5Strndup(&rc, azArg[2], -1); | |
pRet->bColumnsize = 1; | |
pRet->eDetail = FTS5_DETAIL_FULL; | |
#ifdef SQLITE_DEBUG | |
@@ -589,10 +615,26 @@ | |
} | |
sqlite3_free(zOne); | |
sqlite3_free(zTwo); | |
} | |
+ | |
+ if( rc==SQLITE_OK ){ | |
+ int i; | |
+ for(i=0; i<pRet->nCol; i++){ | |
+ if( pRet->abCompressed[i] ){ | |
+ /* A "compressed" column needs both compress= and uncompress= to be set */ | |
+ if( !pRet->zCompressFunc || !pRet->zUnCompressFunc ) { | |
+ char const *zMiss = (pRet->zCompressFunc==0 ? "compress" : "uncompress"); | |
+ rc = SQLITE_ERROR; | |
+ *pzErr = sqlite3_mprintf("missing %s parameter in fts5 constructor", zMiss); | |
+ } | |
+ break; /* the first compressed column decides the outcome; stop scanning */ | |
+ } | |
+ } | |
+ } | |
+ | |
/* If a tokenizer= option was successfully parsed, the tokenizer has | |
** already been allocated. Otherwise, allocate an instance of the default | |
** tokenizer (unicode61) now. */ | |
if( rc==SQLITE_OK && pRet->pTok==0 ){ | |
@@ -653,10 +695,16 @@ | |
sqlite3_free(pConfig->zRank); | |
sqlite3_free(pConfig->zRankArgs); | |
sqlite3_free(pConfig->zContent); | |
sqlite3_free(pConfig->zContentRowid); | |
sqlite3_free(pConfig->zContentExprlist); | |
+ if(pConfig->zCompressFunc) { | |
+ sqlite3_free(pConfig->zCompressFunc); | |
+ } | |
+ if(pConfig->zUnCompressFunc) { | |
+ sqlite3_free(pConfig->zUnCompressFunc); | |
+ } | |
sqlite3_free(pConfig); | |
} | |
} | |
/* | |
Index: ext/fts5/fts5_storage.c | |
================================================================== | |
--- ext/fts5/fts5_storage.c | |
+++ ext/fts5/fts5_storage.c | |
@@ -111,19 +111,25 @@ | |
case FTS5_STMT_INSERT_CONTENT: | |
case FTS5_STMT_REPLACE_CONTENT: { | |
int nCol = pC->nCol + 1; | |
char *zBind; | |
- int i; | |
+ const char *zFunc; | |
+ int i, zFuncSize, zBindSize, bSizeUsed; | |
- zBind = sqlite3_malloc(1 + nCol*2); | |
+ /* Add 4 to account for the "(?)," emitted around each bound parameter */ | |
+ zFuncSize = (int)(pC->zCompressFunc ? strlen(pC->zCompressFunc) : 0)+4; | |
+ zBindSize = 1 + nCol*zFuncSize; | |
+ zBind = sqlite3_malloc(zBindSize); | |
if( zBind ){ | |
+ bSizeUsed = 0; | |
for(i=0; i<nCol; i++){ | |
- zBind[i*2] = '?'; | |
- zBind[i*2 + 1] = ','; | |
+ zFunc = (i && pC->abCompressed[i-1]) ? pC->zCompressFunc : ""; | |
+ sqlite3_snprintf(zBindSize-bSizeUsed, zBind+bSizeUsed, "%s(?),", zFunc); | |
+ bSizeUsed = (int)strlen(zBind); | |
} | |
- zBind[i*2-1] = '\0'; | |
+ zBind[bSizeUsed-1] = '\0'; /* remove the last comma */ | |
zSql = sqlite3_mprintf(azStmt[eStmt], pC->zDb, pC->zName, zBind); | |
sqlite3_free(zBind); | |
} | |
break; | |
} | |
Index: ext/fts5/fts5_tokenize.c | |
================================================================== | |
--- ext/fts5/fts5_tokenize.c | |
+++ ext/fts5/fts5_tokenize.c | |
@@ -233,10 +233,11 @@ | |
struct Unicode61Tokenizer { | |
unsigned char aTokenChar[128]; /* ASCII range token characters */ | |
char *aFold; /* Buffer to fold text into */ | |
int nFold; /* Size of aFold[] in bytes */ | |
int bRemoveDiacritic; /* True if remove_diacritics=1 is set */ | |
+ int nMinWordSize; /* Min size of a word to be indexed */ | |
int nException; | |
int *aiException; | |
unsigned char aCategory[32]; /* True for token char categories */ | |
}; | |
@@ -360,10 +361,11 @@ | |
const char *zCat = "L* N* Co"; | |
int i; | |
memset(p, 0, sizeof(Unicode61Tokenizer)); | |
p->bRemoveDiacritic = 1; | |
+ p->nMinWordSize = 0; | |
p->nFold = 64; | |
p->aFold = sqlite3_malloc(p->nFold * sizeof(char)); | |
if( p->aFold==0 ){ | |
rc = SQLITE_NOMEM; | |
} | |
@@ -393,10 +395,14 @@ | |
if( 0==sqlite3_stricmp(azArg[i], "separators") ){ | |
rc = fts5UnicodeAddExceptions(p, zArg, 0); | |
}else | |
if( 0==sqlite3_stricmp(azArg[i], "categories") ){ | |
/* no-op */ | |
+ }else | |
+ if( 0==sqlite3_stricmp(azArg[i], "min_word_size") ){ | |
+ int mwsz; | |
+ if( sqlite3GetInt32(zArg, &mwsz) ) p->nMinWordSize = mwsz; | |
}else{ | |
rc = SQLITE_ERROR; | |
} | |
} | |
@@ -450,10 +456,11 @@ | |
while( rc==SQLITE_OK ){ | |
int iCode; /* non-ASCII codepoint read from input */ | |
char *zOut = aFold; | |
int is; | |
int ie; | |
+ int wsz; | |
/* Skip any separator characters. */ | |
while( 1 ){ | |
if( zCsr>=zTerm ) goto tokenize_done; | |
if( *zCsr & 0x80 ) { | |
@@ -517,12 +524,15 @@ | |
zCsr++; | |
} | |
ie = zCsr - (unsigned char*)pText; | |
} | |
+ wsz = zOut-aFold; | |
+ /* Check min word size */ | |
+ if(p->nMinWordSize && p->nMinWordSize > wsz) continue; | |
/* Invoke the token callback */ | |
- rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie); | |
+ rc = xToken(pCtx, 0, aFold, wsz, is, ie); | |
} | |
tokenize_done: | |
if( rc==SQLITE_DONE ) rc = SQLITE_OK; | |
return rc; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment