Created
November 17, 2022 11:14
-
-
Save nihalkenkre/09453c4842b632cb6937d9217b1f207e to your computer and use it in GitHub Desktop.
Parse a Python Pickled .pkl file into an actionable JSON object
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
// Decoder state shared by the opcode handlers in this file.
// `memo` stores values saved by the memo opcodes so they can be fetched again
// by index; `stack` is the working value stack the opcodes push to and pop from.
// Both arrays are only ever mutated in place, never rebound, hence `const`.
const memo = [];
const stack = [];
/**
 * Pickle opcode table: symbolic name -> opcode byte value.
 *
 * Declared with `const` (the bare assignment created an implicit global and
 * throws a ReferenceError in strict mode / ES modules) and frozen because the
 * table is a lookup constant that must never change at runtime.
 */
const opCodes = Object.freeze({
  // Integers
  INT: 0x49,
  BININT: 0x4a,
  BININT1: 0x4b,
  BININT2: 0x4d,
  LONG: 0x4c,
  LONG1: 0x8a,
  LONG4: 0x8b,

  // Strings
  STRING: 0x53,
  BINSTRING: 0x54,
  SHORT_BINSTRING: 0x55,

  // Bytes
  BINBYTES: 0x42,
  SHORT_BINBYTES: 0x43,
  BINBYTES8: 0x8e,

  // Byte Array
  BYTEARRAY8: 0x96,

  // Out of band buffer
  NEXT_BUFFER: 0x97,
  READONLY_BUFFER: 0x98,

  // None
  NONE: 0x4e,

  // Booleans
  NEWTRUE: 0x88,
  NEWFALSE: 0x89,

  // Unicode strings
  UNICODE: 0x56,
  SHORT_BINUNICODE: 0x8c,
  BINUNICODE: 0x58,
  BINUNICODE8: 0x8d,

  // Floats
  FLOAT: 0x46,
  BINFLOAT: 0x47,

  // Lists
  EMPTY_LIST: 0x5d,
  APPEND: 0x61,
  APPENDS: 0x65,
  LIST: 0x6c,

  // Tuples
  EMPTY_TUPLE: 0x29,
  TUPLE: 0x74,
  TUPLE1: 0x85,
  TUPLE2: 0x86,
  TUPLE3: 0x87,

  // Dicts
  EMPTY_DICT: 0x7d,
  DICT: 0x64,
  SETITEM: 0x73,
  SETITEMS: 0x75,

  // Sets
  EMPTY_SET: 0x8f,
  ADDITEMS: 0x90,

  // Frozen Sets
  FROZENSET: 0x91,

  // Stack manipulation
  POP: 0x30,
  DUP: 0x32,
  MARK: 0x28,
  POP_MARK: 0x31,

  // Memo manipulation
  GET: 0x67,
  BINGET: 0x68,
  LONG_BINGET: 0x6a,
  PUT: 0x70,
  BINPUT: 0x71,
  LONG_BINPUT: 0x72,
  MEMOIZE: 0x94,

  // Extension registry. Like the GET family.
  EXT1: 0x82,
  EXT2: 0x83,
  EXT4: 0x84,

  // Push a Class Object, Module function on the stack, via its module and name.
  GLOBAL: 0x63,
  STACK_GLOBAL: 0x93,

  // Objects pickle does not know about directly.
  REDUCE: 0x52,
  BUILD: 0x62,
  INST: 0x69,
  OBJ: 0x6f,
  NEWOBJ: 0x81,
  NEWOBJ_EX: 0x92,

  // Machine Control
  PROTO: 0x80,
  STOP: 0x2e,
  FRAME: 0x95,

  // Persistent IDs
  PERSID: 0x50,
  BINPERSID: 0x51,
});
/**
 * Predicate telling whether a stack entry is the 'mark' placeholder string.
 * Suitable as a callback for array search methods such as `findLastIndex`.
 *
 * @param {*} stackString - A value taken from the stack.
 * @returns {boolean} `true` only when the value is exactly the string 'mark'.
 */
function findMark(stackString) {
  // Strict equality on purpose: with `==`, a one-element array such as
  // ['mark'] is coerced to the string 'mark' and would be reported as a mark.
  return stackString === 'mark';
}
// Private placeholder pushed by MARK. Being a Symbol, it can never be equal to
// a value decoded from the pickle (the string 'mark', the array ['mark'], ...).
const MARK_SENTINEL = Symbol('pickle.mark');

/**
 * Reads exactly `count` operand bytes from the byte iterator into a Buffer.
 *
 * @param {Iterator<number>} data - Iterator over the remaining pickle bytes.
 * @param {number} count - Number of bytes to consume.
 * @returns {Buffer} The bytes read, in stream order.
 * @throws {Error} When the iterator is exhausted before `count` bytes arrive.
 */
function readOperand(data, count) {
  const bytes = Buffer.alloc(count);
  for (let i = 0; i < count; ++i) {
    const step = data.next();
    if (step.done) {
      throw new Error(`Unexpected end of pickle data: wanted ${count} operand byte(s), got ${i}`);
    }
    bytes[i] = step.value;
  }
  return bytes;
}

/**
 * Removes the topmost mark and everything above it from the stack.
 *
 * @returns {Array} The values that were above the mark, bottom first.
 * @throws {Error} When the stack holds no mark.
 */
function popToMark() {
  const markIndex = stack.lastIndexOf(MARK_SENTINEL);
  if (markIndex === -1) {
    throw new Error('Corrupt pickle data: opcode requires a MARK but none is on the stack');
  }
  // splice() returns [mark, ...items]; drop the mark itself.
  return stack.splice(markIndex).slice(1);
}

/**
 * Decodes a little-endian two's-complement integer (the LONG1 operand format).
 *
 * @param {Buffer} bytes - The integer bytes, least significant first.
 * @returns {number|string} A Number when the value is a safe integer; otherwise
 *   its decimal string, so no precision is lost and the result stays
 *   JSON-serializable (JSON.stringify rejects BigInt).
 */
function decodeLong(bytes) {
  let value = 0n;
  for (let i = bytes.length - 1; i >= 0; --i) {
    value = (value << 8n) | BigInt(bytes[i]);
  }
  if (bytes.length > 0 && (bytes[bytes.length - 1] & 0x80) !== 0) {
    // Sign bit set: subtract 2^(8 * n) to get the negative value.
    value -= 1n << BigInt(8 * bytes.length);
  }
  const asNumber = Number(value);
  return Number.isSafeInteger(asNumber) ? asNumber : value.toString();
}

/**
 * Executes one pickle opcode against the shared `stack` and `memo`.
 *
 * Operands that follow the opcode are pulled from `data`. Python values are
 * mapped to JSON-friendly ones: None -> null, tuples / lists / sets -> arrays,
 * dicts -> plain objects, bytes -> Buffer, class references ->
 * `{ module, class }`, and constructed objects -> objects carrying `attrs`,
 * `reduce_args` and/or `build_args`.
 *
 * Opcodes listed in the "not implemented" group at the bottom are ignored and
 * their operands are NOT consumed, so a pickle that uses them will not decode
 * correctly.
 *
 * @param {number} opCode - The opcode byte to execute.
 * @param {Iterator<number>} data - Iterator positioned just after the opcode.
 * @returns {*} The finished value when `opCode` is STOP, otherwise `undefined`.
 * @throws {Error} On truncated operands or a mark-based opcode without a mark.
 */
function processOpcode(opCode, data) {
  switch (opCode) {
    // Integers
    case opCodes.BININT:
      stack.push(readOperand(data, 4).readInt32LE(0));
      break;
    case opCodes.BININT1:
      stack.push(readOperand(data, 1)[0]);
      break;
    case opCodes.BININT2:
      stack.push(readOperand(data, 2).readUInt16LE(0));
      break;
    case opCodes.LONG1: {
      const byteCount = readOperand(data, 1)[0];
      stack.push(decodeLong(readOperand(data, byteCount)));
      break;
    }

    // Bytes and byte arrays are both surfaced as Buffers.
    case opCodes.SHORT_BINBYTES: {
      const byteCount = readOperand(data, 1)[0];
      stack.push(readOperand(data, byteCount));
      break;
    }
    case opCodes.BINBYTES: {
      const byteCount = readOperand(data, 4).readUInt32LE(0);
      stack.push(readOperand(data, byteCount));
      break;
    }
    case opCodes.BINBYTES8:
    case opCodes.BYTEARRAY8: {
      const byteCount = Number(readOperand(data, 8).readBigUInt64LE(0));
      stack.push(readOperand(data, byteCount));
      break;
    }

    // None
    case opCodes.NONE:
      stack.push(null);
      break;

    // Booleans
    case opCodes.NEWTRUE:
      stack.push(true);
      break;
    case opCodes.NEWFALSE:
      stack.push(false);
      break;

    // Unicode strings (length-prefixed UTF-8)
    case opCodes.SHORT_BINUNICODE: {
      const byteCount = readOperand(data, 1)[0];
      stack.push(readOperand(data, byteCount).toString());
      break;
    }
    case opCodes.BINUNICODE: {
      const byteCount = readOperand(data, 4).readUInt32LE(0);
      stack.push(readOperand(data, byteCount).toString());
      break;
    }
    case opCodes.BINUNICODE8: {
      const byteCount = Number(readOperand(data, 8).readBigUInt64LE(0));
      stack.push(readOperand(data, byteCount).toString());
      break;
    }

    // Floats
    case opCodes.BINFLOAT:
      stack.push(readOperand(data, 8).readDoubleBE(0));
      break;

    // Empty list / tuple / set: all represented as arrays.
    case opCodes.EMPTY_LIST:
    case opCodes.EMPTY_TUPLE:
    case opCodes.EMPTY_SET:
      stack.push([]);
      break;
    case opCodes.APPEND: {
      const item = stack.pop();
      stack[stack.length - 1].push(item);
      break;
    }
    case opCodes.APPENDS:
    case opCodes.ADDITEMS: {
      const items = popToMark();
      const target = stack[stack.length - 1];
      for (const item of items) {
        target.push(item);
      }
      break;
    }

    // Tuples / frozen sets built from the items above the mark.
    case opCodes.TUPLE:
    case opCodes.FROZENSET:
      stack.push(popToMark());
      break;
    case opCodes.TUPLE1:
      stack.push([stack.pop()]);
      break;
    case opCodes.TUPLE2: {
      const second = stack.pop();
      const first = stack.pop();
      stack.push([first, second]);
      break;
    }
    case opCodes.TUPLE3: {
      const third = stack.pop();
      const second = stack.pop();
      const first = stack.pop();
      stack.push([first, second, third]);
      break;
    }

    // Dicts
    case opCodes.EMPTY_DICT:
      stack.push({});
      break;
    case opCodes.SETITEM: {
      const value = stack.pop();
      const key = stack.pop();
      stack[stack.length - 1][key] = value;
      break;
    }
    case opCodes.SETITEMS: {
      const items = popToMark();
      const dict = stack[stack.length - 1];
      // Items alternate key, value; a dangling trailing key is ignored.
      for (let i = 0; i + 1 < items.length; i += 2) {
        dict[items[i]] = items[i + 1];
      }
      break;
    }

    // Stack manipulation
    case opCodes.POP:
      stack.pop();
      break;
    case opCodes.DUP:
      stack.push(stack[stack.length - 1]);
      break;
    case opCodes.MARK:
      stack.push(MARK_SENTINEL);
      break;
    case opCodes.POP_MARK:
      popToMark();
      break;

    // Memo manipulation
    case opCodes.BINGET:
      stack.push(memo[readOperand(data, 1)[0]]);
      break;
    case opCodes.LONG_BINGET:
      stack.push(memo[readOperand(data, 4).readUInt32LE(0)]);
      break;
    case opCodes.BINPUT:
      memo[readOperand(data, 1)[0]] = stack[stack.length - 1];
      break;
    case opCodes.LONG_BINPUT:
      memo[readOperand(data, 4).readUInt32LE(0)] = stack[stack.length - 1];
      break;
    case opCodes.MEMOIZE:
      memo.push(stack[stack.length - 1]);
      break;

    // Push a Class Object, Module function on the stack, via its module and name.
    case opCodes.GLOBAL:
      console.log('\n===== opcode GLOBAL Not yet implemented =====\n');
      break;
    case opCodes.STACK_GLOBAL: {
      const cls = stack.pop();
      const mod = stack.pop();
      stack.push({ module: mod, class: cls });
      break;
    }

    // Objects pickle does not know about directly.
    case opCodes.REDUCE: {
      const args = stack.pop();
      const callable = stack.pop();
      stack.push({ callable: callable, reduce_args: args });
      break;
    }
    case opCodes.BUILD: {
      const state = stack.pop();
      stack[stack.length - 1]['build_args'] = state;
      break;
    }
    case opCodes.NEWOBJ: {
      const args = stack.pop();
      const cls = stack.pop();
      // Build a fresh object instead of mutating the class descriptor: the
      // descriptor may be memoized and shared by several instances.
      stack.push({ ...cls, attrs: args });
      break;
    }

    // Machine Control
    case opCodes.PROTO:
      readOperand(data, 1); // protocol version, not needed for decoding
      break;
    case opCodes.FRAME:
      readOperand(data, 8); // frame length, the byte stream is already contiguous
      break;
    case opCodes.STOP:
      return stack.pop();

    // Not implemented yet: ignored, operands (if any) are left in the stream.
    case opCodes.INT:
    case opCodes.LONG:
    case opCodes.LONG4:
    case opCodes.STRING:
    case opCodes.BINSTRING:
    case opCodes.SHORT_BINSTRING:
    case opCodes.NEXT_BUFFER:
    case opCodes.READONLY_BUFFER:
    case opCodes.UNICODE:
    case opCodes.FLOAT:
    case opCodes.LIST:
    case opCodes.DICT:
    case opCodes.GET:
    case opCodes.PUT:
    case opCodes.EXT1:
    case opCodes.EXT2:
    case opCodes.EXT4:
    case opCodes.INST:
    case opCodes.OBJ:
    case opCodes.NEWOBJ_EX:
    case opCodes.PERSID:
    case opCodes.BINPERSID:
    default:
      break;
  }
  return undefined;
}
/**
 * Decodes one pickle byte stream by feeding each opcode to `processOpcode`,
 * then logs the decoded value as JSON.
 *
 * @param {Iterator<number>} data - Iterator over the pickle bytes, positioned
 *   at the first opcode to execute.
 * @returns {*} The value produced by the STOP opcode, or `undefined` when the
 *   stream ends without one.
 */
function processData(data) {
  // Start every pickle from a clean machine; leftovers from a previous call
  // would shift memo indexes and pollute the stack.
  stack.length = 0;
  memo.length = 0;

  let pickledObject;
  for (let next = data.next(); !next.done; next = data.next()) {
    pickledObject = processOpcode(next.value, data);
    if (next.value === opCodes.STOP) {
      // STOP ends the pickle; any trailing bytes must not overwrite the result.
      break;
    }
  }

  console.log('OBJECT', JSON.stringify(pickledObject));
  return pickledObject;
}
/**
 * Opens a pickle file, reads its first frame and hands the frame's bytes to
 * `processData`.
 *
 * The file must begin with an 11-byte header: the PROTO opcode, one version
 * byte, the FRAME opcode and an 8-byte little-endian frame length. Only the
 * bytes of that first frame are decoded.
 *
 * Errors are thrown from inside the asynchronous `fs.open` callback, so they
 * cannot be caught by the caller of this function.
 *
 * @param {string} pklFilename - Path of the pickle file to decode.
 * @returns {void}
 */
function processPickle(pklFilename) {
  const HEADER_LENGTH = 11;
  const FRAME_LENGTH_OFFSET = 3;

  fs.open(pklFilename, 'r', (err, fd) => {
    if (err) {
      throw err;
    }

    let dataBuffer;
    try {
      const headerBuffer = Buffer.alloc(HEADER_LENGTH);
      if (fs.readSync(fd, headerBuffer, 0, HEADER_LENGTH, null) !== HEADER_LENGTH) {
        throw new Error('File could not be read correctly');
      }
      // Without this check a pickle lacking PROTO/FRAME would have arbitrary
      // payload bytes interpreted as the frame length.
      if (headerBuffer[0] !== opCodes.PROTO || headerBuffer[2] !== opCodes.FRAME) {
        throw new Error('Unsupported pickle: expected a PROTO opcode followed by a FRAME opcode');
      }

      const frameLength = Number(headerBuffer.readBigUInt64LE(FRAME_LENGTH_OFFSET));
      dataBuffer = Buffer.allocUnsafe(frameLength);
      if (fs.readSync(fd, dataBuffer, 0, dataBuffer.length, null) !== dataBuffer.length) {
        throw new Error('File could not be read correctly');
      }
    } finally {
      // Release the descriptor on every path, including the error ones.
      fs.closeSync(fd);
    }

    processData(dataBuffer.values());
  });
}
module.exports = processPickle; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment