Created
May 12, 2023 03:52
-
-
Save iwalton3/b76d052e09b7ddec1ff5e4cc178f5713 to your computer and use it in GitHub Desktop.
Message Splitter - Split chat messages into chunks for training GPTs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"data": "%data%" | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import json | |
import datetime | |
import re | |
# Licensed under the MIT License. | |
# Usage is unlimited, but please be wary of the consequences of training GPTs on human messages.
# Attempting to mimic a human personality via GPT could be very harmful depending on the context. | |
# To use, specify the values below and run. You can then train the resulting file | |
# in text-generation-webui as a LoRA using the data-format.json format file. | |
# messages must be within 5 minutes of each other
chunk_time = 60*5

# minimum number of messages in a chunk, or block is thrown out
# I would suggest changing this to 5 for real data
min_messages = 3

# if a thread is longer than this, split into overlapping blocks
block_size = 10

input_file = 'example-chat-fmt.json'
output_file = 'dataset-chat-split.json'

# a message containing one of these phrases or words is dropped
# AND causes a message thread split at that point
# (matching is case-insensitive)
disallowed_phrases = []
disallowed_words = []

word_regex = re.compile(r'([a-zA-Z_]+)')
half_block_size = block_size // 2

# appended to the last message of every speaker turn
END_MARKER = '<!end!>'


def _parse_timestamp(value):
    """Return the POSIX timestamp (seconds) for an ISO-8601 string."""
    # datetime.fromisoformat() only understands a trailing "Z" from
    # Python 3.11 onwards; the example data uses it, so normalise it here.
    if value.endswith('Z'):
        value = value[:-1] + '+00:00'
    return datetime.datetime.fromisoformat(value).timestamp()


def _is_disallowed(content, phrases, words):
    """Return True if *content* contains a disallowed phrase or word.

    *phrases* is an iterable of lower-cased substrings and *words* a set of
    lower-cased words; words are matched against the tokens produced by
    ``word_regex``.
    """
    lowered = content.lower()
    if any(phrase in lowered for phrase in phrases):
        return True
    return any(word in words for word in word_regex.findall(lowered))


def split_thread(thread_messages):
    """Split one thread into chunks of consecutive messages.

    *thread_messages* is a list of dicts with ``timestamp``, ``user`` and
    ``message`` keys. A new chunk is started when the gap to the previous
    kept message exceeds ``chunk_time`` or when a disallowed message was
    skipped. A chunk that reaches ``block_size`` is emitted and the next one
    starts with its tail (from index ``half_block_size``) so that long
    conversations become overlapping blocks. Chunks with fewer than
    ``min_messages`` messages are discarded.

    Returns a list of chunks; each chunk is a list of the original message
    dicts (not copies).
    """
    phrases = [phrase.lower() for phrase in disallowed_phrases]
    words = {word.lower() for word in disallowed_words}

    chunks = []
    messages = []
    last_ts = None
    flag_split = False
    for msg in thread_messages:
        if _is_disallowed(msg['message'], phrases, words):
            # drop the message and remember to break the thread here
            flag_split = True
            continue
        message_ts = _parse_timestamp(msg['timestamp'])
        gap_too_long = last_ts is not None and message_ts - last_ts > chunk_time
        if gap_too_long or flag_split:
            flag_split = False
            # ">=" so that the threshold matches the end-of-thread check below
            if len(messages) >= min_messages:
                chunks.append(messages)
            messages = []
        elif len(messages) >= block_size:
            chunks.append(messages)
            # slicing creates a new list, the emitted chunk stays untouched
            messages = messages[half_block_size:]
        last_ts = message_ts
        messages.append(msg)
    if len(messages) >= min_messages:
        chunks.append(messages)
    return chunks


def format_chunk(chunk):
    """Render a chunk of messages as one training text.

    Each message becomes one line. The first line of a speaker turn is
    prefixed with ``"<user>: "``; follow-up lines by the same user contain
    only the message text. The last line of every turn ends with
    ``END_MARKER``.

    The message dicts are not modified: overlapping chunks share them, and
    writing markers into them would leak a marker into the middle of a turn
    in the next chunk.
    """
    lines = []
    previous_author = None
    final_index = len(chunk) - 1
    for index, message in enumerate(chunk):
        author = message['user']
        text = message['message']
        if index == final_index or chunk[index + 1]['user'] != author:
            text += END_MARKER
        if previous_author is None or author != previous_author:
            lines.append(f"{author}: {text}")
        else:
            lines.append(text)
        previous_author = author
    return '\n'.join(lines)


def build_training_data(threads):
    """Return the list of ``{"data": text}`` records for all *threads*.

    *threads* is the parsed input file: a list of objects whose ``data`` key
    holds that thread's message list.
    """
    return [
        {"data": format_chunk(chunk)}
        for thread in threads
        for chunk in split_thread(thread['data'])
    ]


def main():
    """Read ``input_file``, split it and write the result to ``output_file``."""
    with open(input_file, encoding='utf-8') as inp:
        threads = json.load(inp)
    training_data = build_training_data(threads)
    # open the output only after processing succeeded, so a bad input file
    # does not leave a truncated dataset behind
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(training_data, out)
    print("Created", len(training_data), "training data chunks")


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
{ | |
"data": [ | |
{ | |
"timestamp": "2023-05-11T12:00:00Z", | |
"user": "Alice", | |
"message": "Hey, how's it going?" | |
}, | |
{ | |
"timestamp": "2023-05-11T12:01:00Z", | |
"user": "Bob", | |
"message": "Pretty good! Just working on that project due next week." | |
}, | |
{ | |
"timestamp": "2023-05-11T12:04:00Z", | |
"user": "Alice", | |
"message": "Oh, right. How's that coming along?" | |
} | |
] | |
}, | |
{ | |
"data": [ | |
{ | |
"timestamp": "2023-05-11T12:10:00Z", | |
"user": "Bob", | |
"message": "I'm having some issues with the calculations." | |
}, | |
{ | |
"timestamp": "2023-05-11T12:11:00Z", | |
"user": "Alice", | |
"message": "I can help with that, if you want." | |
}, | |
{ | |
"timestamp": "2023-05-11T12:13:00Z", | |
"user": "Bob", | |
"message": "That would be great, thanks!" | |
} | |
] | |
} | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment