Last active
August 9, 2022 12:33
-
-
Save aymanfarhat/456cbf52adb9b252881e40f855cefb6d to your computer and use it in GitHub Desktop.
Example of extracting and flattening git logs and writing them into BigQuery
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Copyright 2022 Google LLC. | |
#SPDX-License-Identifier: Apache-2.0 | |
# Load the extracted JSONL file into BigQuery (EU location), replacing the
# destination table's current contents and letting BigQuery infer the schema.
bq --location="EU" load --replace --autodetect \
  --source_format="NEWLINE_DELIMITED_JSON" \
  your_dataset.your_table \
  ./logs.jsonl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"commit_hash": "1e0e4b74e2fad3a", "commit_subject": "Example commit 1", "author_name": "Author 1", "author_email": "[email protected]", "author_date": "1655894118", "commiter_email": "[email protected]", "commiter_name": "GitHub", "file": "tools/sometool1/file.py\n"} | |
{"commit_hash": "e551dd384acc13f", "commit_subject": "Example commit 2", "author_name": "Author 2", "author_email": "[email protected]", "author_date": "1655821884", "commiter_email": "[email protected]", "commiter_name": "GitHub", "file": "tools/sometool2/path/config.yaml\n"} | |
{"commit_hash": "f96748b7b3b8f54", "commit_subject": "Example commit 3", "author_name": "Author 3", "author_email": "[email protected]", "author_date": "1655797941", "commiter_email": "[email protected]", "commiter_name": "GitHub", "file": "tools/sometool3/README.md\n"} | |
{"commit_hash": "f96748b7b3b8f54", "commit_subject": "Example commit 3", "author_name": "Author 3", "author_email": "[email protected]", "author_date": "1655797941", "commiter_email": "[email protected]", "commiter_name": "GitHub", "file": "tools/sometool3/src/main/java/functions/SendNotification.java\n"} | |
{"commit_hash": "f96748b7b3b8f54", "commit_subject": "Example commit 3", "author_name": "Author 3", "author_email": "[email protected]", "author_date": "1655797941", "commiter_email": "[email protected]", "commiter_name": "GitHub", "file": "tools/sometool3/src/main/java/functions/eventpojos/PubSubMessage.java\n"} | |
{"commit_hash": "f96748b7b3b8f54", "commit_subject": "Example commit 3", "author_name": "Author 3", "author_email": "[email protected]", "author_date": "1655797941", "commiter_email": "[email protected]", "commiter_name": "GitHub", "file": "tools/sometool3/src/test/java/functions/SendNotificationTest.java\n"} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Copyright 2022 Google LLC. | |
#SPDX-License-Identifier: Apache-2.0 | |
# Extract the history of origin/main from a local clone and write it to
# ./logs.jsonl, one JSON object per line.
python extract.py \
  --output='./logs.jsonl' \
  --branch='main' \
  --git-dir='/path/to/your/repository/.git'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Copyright 2022 Google LLC. | |
#SPDX-License-Identifier: Apache-2.0 | |
"""
A simple script for extracting and flattening git logs into one record per
file per commit (file, commit hash, subject, author, committer, date),
formatted as JSONL suitable for loading into BigQuery.
"""
import io | |
import json | |
import re | |
import argparse | |
import subprocess | |
def get_logs(git_dir, fields, branch):
    """
    Run ``git log`` for ``origin/<branch>`` and yield its output line by line.

    Args:
        git_dir: Path handed to ``git --git-dir``.
        fields: Iterable of ``(name, placeholder)`` pairs. Each pair becomes
            one ``name: placeholder`` line of the ``--pretty=format:``
            template, e.g. ``('commit_hash', '%H')``.
        branch: Branch name; the log is read from ``origin/<branch>``.

    Yields:
        Output lines decoded as UTF-8, each still carrying its line
        terminator.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status
            after its output has been fully consumed (for example when the
            branch or the git directory does not exist).
    """
    pretty_format = '%n'.join(f'{k}: {v}' for k, v in fields)
    command = [
        'git', '--git-dir', git_dir, 'log', f'origin/{branch}', '--name-only',
        f'--pretty=format:{pretty_format}'
    ]
    # The context manager closes the pipe and waits for the child on exit,
    # so no zombie process or open file descriptor is left behind, even when
    # the consumer stops iterating early.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
        yield from io.TextIOWrapper(proc.stdout, encoding='utf-8')
    # Only reached when the output was read to the end: surface git failures
    # instead of silently producing an empty or truncated log.
    if proc.returncode:
        raise subprocess.CalledProcessError(proc.returncode, command)
def parse_logs(logs, fields):
    """
    Flatten ``git log --name-only`` output into one record per changed file.

    The input is expected to consist of ``name: value`` header lines (one per
    entry in ``fields``), file path lines, and blank separator lines. Header
    lines update the current commit details; every other non-blank line is
    treated as a file path and produces a record.

    Args:
        logs: Iterable of text lines (with or without line terminators).
        fields: Iterable of ``(name, placeholder)`` pairs; only the names are
            used, as the set of recognised header keys.

    Yields:
        A new dict per file line holding the most recently seen value of
        every field (``None`` until first seen) plus a ``'file'`` key with
        the path, without its trailing line terminator.
    """
    state = {name: None for name, _ in fields}
    for line in logs:
        text = line.rstrip('\r\n')
        if not text.strip():
            # Blank separator between commits.
            continue
        # Split on the FIRST colon so values that themselves contain ": "
        # (e.g. a subject like "feat: add x") are kept whole and still
        # recognised as belonging to their header key.
        key, sep, value = text.partition(':')
        key = key.strip()
        if sep and key in state:
            state[key] = value.strip()
            continue
        # Anything that is not a known header is a path. This keeps
        # root-level files, dot-directories and paths with unusual
        # characters instead of dropping them.
        record = dict(state)
        record['file'] = text
        yield record
def main():
    """
    Command-line entry point.

    Parses the three required options (--git-dir, --branch, --output), reads
    the git log of the requested branch, flattens it into one record per file
    and writes each record as a JSON object on its own line of the output
    file.
    """
    cli = argparse.ArgumentParser(
        description='Utility to extract git logs into jsonl')
    required_options = (
        ('--git-dir', 'Git directory to read logs from'),
        ('--branch', 'Git branch to read logs from'),
        ('--output', 'Target file to write transformed logs to'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    options = cli.parse_args()

    # Output key -> git pretty-format placeholder.
    fields = [
        ('commit_hash', '%H'),
        ('commit_subject', '%s'),
        ('author_name', '%aN'),
        ('author_email', '%aE'),
        ('author_date', '%at'),
        ('commiter_email', '%ce'),
        ('commiter_name', '%cn'),
    ]

    raw_lines = get_logs(options.git_dir, fields, options.branch)
    records = parse_logs(raw_lines, fields)

    with open(options.output, mode='w', encoding='UTF-8') as sink:
        sink.writelines(f'{json.dumps(record)}\n' for record in records)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment