Print size distribution of your Solr index
#!/usr/bin/env python3
# coding=utf8
"""
CFE file layout:
    Header:
        CodecHeader:
            Magic        [4byte uint32]
            CodecName    [?byte String]
            Version      [4byte uint32]
            ObjectId     [16byte]
            SuffixLength [1byte]
            SuffixBytes  [`SuffixLength` bytes]
    FileCount            [?byte VInt]
    <repeated>:
        FileName         [?byte String]
        DataOffset       [8byte uint64]
        DataLength       [8byte uint64]
"""
import os
import struct
import sys
from collections import defaultdict, namedtuple
from itertools import chain

TYPE_MAPPING = {
    'si': 'Segment Info',
    'fnm': 'Fields Info',
    'fdx': 'Fields Index',
    'fdt': 'Field Data',
    'tim': 'Term Dictionary',
    'tip': 'Term Index',
    'doc': 'Frequencies',
    'pos': 'Positions',
    'pay': 'Payloads',
    'nvd': 'Norms (nvd)',
    'nvm': 'Norms (nvm)',
    'dvd': 'Per-Document Values (dvd)',
    'dvm': 'Per-Document Values (dvm)',
    'tvx': 'Term Vector Index',
    'tvd': 'Term Vector Documents',
    'tvf': 'Term Vector Fields',
    'liv': 'Live Documents',
    'dii': 'Point Values (dii)',
    'dim': 'Point Values (dim)'
}
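# The extensions above cover the files written by the Lucene default codecs
# of this era; the list is not necessarily exhaustive, so extensions missing
# from the mapping are reported under their raw extension further below.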

Header = namedtuple('Header', ['codec_name', 'codec_version', 'object_id',
                               'suffix'])
File = namedtuple('File', ['name', 'offset', 'length'])

def parse_vint(fp):
    """Read a variable-length integer: 7 bits per byte, high bit = "more"."""
    b = ord(fp.read(1))
    i = b & 0x7f
    shift = 7
    while (b & 0x80) != 0:
        b = ord(fp.read(1))
        i |= (b & 0x7f) << shift
        shift += 7
    return i
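# Example: the byte sequence 0x85 0x01 decodes as (0x85 & 0x7f) | (0x01 << 7)
# = 5 + 128 = 133; a single byte below 0x80 stands for itself.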

def parse_string(fp):
    # Strings are stored as a VInt byte length followed by UTF-8 data.
    length = parse_vint(fp)
    return fp.read(length).decode('utf8')

def parse_header(fp):
    magic = struct.unpack('>I', fp.read(4))[0]
    assert magic == 1071082519
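    # 1071082519 == 0x3FD76C17, Lucene's CodecUtil.CODEC_MAGIC.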
    name = parse_string(fp)
    version = struct.unpack('>I', fp.read(4))[0]
    object_id = fp.read(16)
    suffix_len = ord(fp.read(1))
    if suffix_len > 0:
        suffix = fp.read(suffix_len)
    else:
        suffix = None
    return Header(codec_name=name, codec_version=version, object_id=object_id,
                  suffix=suffix)

def parse_files(fp):
    num_files = parse_vint(fp)
    for fnum in range(num_files):
        # Guard against truncated entry tables: stop once we hit EOF.
        if fp.tell() == os.fstat(fp.fileno()).st_size:
            break
        fname = parse_string(fp)
        offset = struct.unpack('>Q', fp.read(8))[0]
        length = struct.unpack('>Q', fp.read(8))[0]
        yield File(fname, offset, length)

def parse_cfe(fname):
    try:
        with open(fname, 'rb') as fp:
            header = parse_header(fp)
            files = list(parse_files(fp))
            return header, files
    except Exception:
        print("Could not read {}".format(fname), file=sys.stderr)
        return None, []
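# Example (hypothetical path): parse_cfe('index/_0.cfe') returns the codec
# header plus one File entry per sub-file bundled into the matching .cfs.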

def analyze_index(data_dir):
    index_dir = os.path.join(
        data_dir, next(fname for fname in os.listdir(data_dir)
                       if fname.startswith('index') and
                       os.path.isdir(os.path.join(data_dir, fname))))
    # Sizes of sub-files listed in the compound-file entry tables (.cfe)
    cfe_infos = chain.from_iterable(
        parse_cfe(os.path.join(index_dir, cfename))[1]
        for cfename in os.listdir(index_dir)
        if cfename.endswith('.cfe'))
    # Sizes of files that live outside of compound files
    single_infos = {
        fname: os.stat(os.path.join(index_dir, fname)).st_size
        for fname in os.listdir(index_dir)
        if os.path.splitext(fname)[1] not in ('.cfe', '.cfs', '.lock')}
    stats = defaultdict(int)
    for cfe in cfe_infos:
        ftype = (cfe.name[1:] if cfe.name.startswith('.')
                 else os.path.splitext(cfe.name)[1][1:])
        if not ftype:
            continue
        stats[ftype] += cfe.length
    for fname, size in single_infos.items():
        ftype = os.path.splitext(fname)[1][1:]
        if not ftype:
            continue
        stats[ftype] += size
    total_size = sum(stats.values())
    for ftype, size in sorted(stats.items(), key=lambda x: x[1]):
        print("{:<25} {:>6.1f}GiB {:>5.1f}%".format(
            TYPE_MAPPING.get(ftype, ftype),
            size / 1024. / 1024. / 1024.,
            (float(size) / total_size) * 100))
    print("=" * 42)
    print("{:<25} {:>6.1f}GiB".format(
        "Total", total_size / 1024. / 1024. / 1024.))

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Please run with the Solr data directory as the first argument.")
    else:
        analyze_index(sys.argv[1])
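Run the script with a Solr core's data directory, i.e. the directory that contains the `index` folder, e.g. `python3 solr_index_sizes.py /var/solr/data/mycore` (the script and core names here are just examples). Each output line shows the Lucene file type, its cumulative size in GiB, and its share of the total index size, sorted from smallest to largest.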