Created
January 12, 2023 18:39
-
-
Save jigpu/d1d6311de32f82fbfdf55a5f4fc52315 to your computer and use it in GitHub Desktop.
Extract and merge the copyright headers from a list of files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import re | |
""" | |
Extract and merge the copyright headers from a list of files.

For each file:
  1. Find a copyright header
  2. Split it into "copyright notice" and "license text" sections
  3. Merge copyright notices into a common list (ignore minor differences)
  4. Merge license texts into a common list (ignore minor differences)

When done, print out the merged copyright notices and license texts.
""" | |
def getFiletype(filename):
    """
    Return the lower-cased extension of a file name, without the dot.

    Only the final path component is inspected, so a dot inside a
    directory name is never mistaken for the start of an extension.
    A base name that contains no dot yields an empty string instead of
    the whole name.

    :param filename: Path (or bare name) of the file.
    :return: The text after the last "." of the base name, lower-cased,
        or "" when the base name has no ".".
    """
    # Function-scope import so the module's import block stays untouched.
    import os.path

    basename = os.path.basename(filename).lower()
    _, dot, extension = basename.rpartition(".")
    return extension if dot else ""
def readBlockComments(fileText, startMarker, endMarker=None):
    """
    Collect the text of the comment blocks delimited by the given markers.

    Without an endMarker the markers are treated as line comments: every
    line that contains startMarker contributes the text after its first
    occurrence, and a run of consecutive such lines forms one block.

    With an endMarker, a block opens at the first startMarker seen while
    outside a block and closes on the first line (possibly the opening
    line itself) that contains endMarker afterwards. While a block is
    open the start marker is not searched for, which is what makes
    identical start and end markers (e.g. triple quotes) work across
    several lines.

    :param fileText: Full text of the file to scan.
    :param startMarker: String that opens a comment.
    :param endMarker: String that closes a comment, or None for
        line-style comments.
    :return: List of block texts; the lines of a block are joined with
        "\\n" and the markers themselves are stripped. A block still
        open at end of input is returned as well.
    """
    blocks = []
    current = None  # lines of the block being collected; None when outside

    for line in fileText.splitlines():
        if endMarker is None:
            # Line-comment mode: a line without the marker ends the run.
            _, found, text = line.partition(startMarker)
            if found:
                if current is None:
                    current = []
                current.append(text)
            elif current is not None:
                blocks.append("\n".join(current))
                current = None
            continue

        text = line
        if current is None:
            # Outside a block: only a start marker is of interest.
            _, found, text = line.partition(startMarker)
            if not found:
                continue
            current = []

        # Inside a block (possibly just opened): look for the terminator
        # in whatever follows the start marker on this line.
        body, found, _ = text.rpartition(endMarker)
        if found:
            current.append(body)
            blocks.append("\n".join(current))
            current = None
        else:
            current.append(text)

    if current is not None:
        # Unterminated block at end of input.
        blocks.append("\n".join(current))
    return blocks
def readCComments(fileText):
    """
    Return the /* ... */ comment blocks of fileText with their leading
    asterisk decoration removed.

    For every block produced by readBlockComments, a line-leading run of
    asterisks followed by a space is deleted, and a line that consists
    only of (optionally indented) asterisks is replaced by a newline.
    """
    cleaned = []
    for rawBlock in readBlockComments(fileText, '/*', '*/'):
        # First drop the "* " prefixes, then turn bare "*" lines into gaps.
        withoutPrefix = re.sub(r"^\s*\*+ ", "", rawBlock, flags=re.MULTILINE)
        cleaned.append(re.sub(r"^\s*\*+$", "\n", withoutPrefix, flags=re.MULTILINE))
    return cleaned
def readCPPComments(fileText):
    """
    Return the runs of "//" line comments found in fileText, as
    extracted by readBlockComments in line-comment mode.
    """
    lineCommentBlocks = readBlockComments(fileText, '//')
    return lineCommentBlocks
def readShellComments(fileText):
    """
    Return the runs of "#" line comments found in fileText, without the
    interpreter ("#!/...") line.

    When the first block starts with an interpreter line, only that line
    is removed. Any comment lines that follow it directly belong to the
    same block and are kept, so a header placed right under the shebang
    is not lost. A first block consisting solely of the interpreter line
    is dropped entirely.

    :param fileText: Full text of the file to scan.
    :return: List of comment block texts.
    """
    comments = readBlockComments(fileText, '#')
    if comments and comments[0].startswith("!/"):
        # The "#" has already been stripped, so a shebang shows up as "!/...".
        _, newline, remainder = comments[0].partition("\n")
        if newline:
            comments = [remainder] + comments[1:]
        else:
            comments = comments[1:]
    return comments
def readXMLComments(fileText):
    """
    Return the <!-- ... --> comment blocks of fileText with their leading
    tilde decoration removed.

    For every block produced by readBlockComments, a line-leading run of
    tildes followed by a space is deleted, and a line that consists only
    of (optionally indented) tildes is replaced by a newline.
    """
    cleaned = []
    for rawBlock in readBlockComments(fileText, '<!--', '-->'):
        # First drop the "~ " prefixes, then turn bare "~" lines into gaps.
        withoutPrefix = re.sub(r"^\s*~+ ", "", rawBlock, flags=re.MULTILINE)
        cleaned.append(re.sub(r"^\s*~+$", "\n", withoutPrefix, flags=re.MULTILINE))
    return cleaned
def readPythonBlockComments(fileText):
    """
    Return the text found between triple-double-quote markers in
    fileText, as extracted by readBlockComments.
    """
    tripleQuote = '"""'
    return readBlockComments(fileText, tripleQuote, tripleQuote)
def findCommentBlocks(fileText, fileHint=None):
    """
    Gather comment blocks from fileText using the comment syntaxes that
    apply to the given file type.

    :param fileText: Full text of the file to scan.
    :param fileHint: File type string ('c', 'cpp', 'java', 'kt', 'sh',
        'py', 'xml' or 'html'), or None to try every known syntax.
        Any other value selects no syntax at all.
    :return: List of comment block texts, grouped by syntax in the order
        C block, C++ line, shell, Python triple-quote, XML.
    """
    tryEverything = fileHint is None
    found = []

    if tryEverything or fileHint in ('c', 'cpp', 'java', 'kt'):
        found += readCComments(fileText)
        found += readCPPComments(fileText)

    if tryEverything or fileHint in ('sh', 'py'):
        found += readShellComments(fileText)

    if tryEverything or fileHint == 'py':
        found += readPythonBlockComments(fileText)

    if tryEverything or fileHint in ('xml', 'html'):
        found += readXMLComments(fileText)

    return found
def findHeaders(fileText, filetype):
    """
    Return the comment blocks of fileText that mention "copyright"
    (case-insensitively).

    :param fileText: Full text of the file to scan.
    :param filetype: File type hint passed on to findCommentBlocks.
    :return: List of matching comment block texts.
    """
    headers = []
    for candidate in findCommentBlocks(fileText, filetype):
        if "copyright" in candidate.lower():
            headers.append(candidate)
    return headers
def splitHeader(headerText):
    """
    Split one header comment into copyright notice lines and license texts.

    Lines are scanned in order. Outside a license run, a line starting
    with "copyright" or "all rights reserved" (ignoring case and
    surrounding whitespace) is a notice and also closes any license text
    collected so far. Any other non-blank line starts (or resumes) a
    license run. Inside a run every line is collected verbatim; the run
    is left only on the line that follows a blank line.

    :param headerText: Text of a single header comment.
    :return: Tuple (notices, licenses): the notice lines, and the license
        texts with their lines joined by "\\n".
    """
    noticeLines = []
    licenseTexts = []
    pending = []          # lines of the license text being accumulated
    collecting = False    # True while inside a license run
    previous = None       # normalized form of the preceding line

    for rawLine in headerText.splitlines():
        normalized = rawLine.lower().strip()

        # A blank line ends the run, but only takes effect one line later.
        if previous == '':
            collecting = False

        if collecting:
            pending.append(rawLine)
        elif normalized.startswith(("copyright", "all rights reserved")):
            noticeLines.append(rawLine)
            if pending:
                licenseTexts.append("\n".join(pending))
                pending = []
        elif normalized:
            collecting = True
            pending.append(rawLine)

        previous = normalized

    if pending:
        licenseTexts.append("\n".join(pending))
    return (noticeLines, licenseTexts)
def splitHeaders(headerBlocks):
    """
    Run splitHeader over every header block and concatenate the results.

    :param headerBlocks: Iterable of header comment texts.
    :return: Tuple (notices, licenses) holding the notice lines and the
        license texts of all blocks, in input order.
    """
    allNotices, allLicenses = [], []
    for header in headerBlocks:
        blockNotices, blockLicenses = splitHeader(header)
        allNotices += blockNotices
        allLicenses += blockLicenses
    return (allNotices, allLicenses)
def simpleStringMatch(stringA, stringB):
    """
    Tell whether two strings are equal once case and all whitespace
    characters are ignored.
    """
    def normalize(text):
        # Lower-case, then delete every whitespace character.
        return re.sub(r"\s", "", text.lower())

    return normalize(stringA) == normalize(stringB)
def mergeDuplicatesInPlace(inputList, compare_fn):
    """
    Remove from inputList, in place, every item that compare_fn reports
    as matching an earlier surviving item.

    :param inputList: List to de-duplicate; it is modified directly.
    :param compare_fn: Callable taking (earlierItem, laterItem) and
        returning a truthy value when the later item is a duplicate.
    :return: None.
    """
    position = 0
    while position < len(inputList):
        reference = inputList[position]
        position += 1
        # Keep only the later items that do not match the reference.
        survivors = []
        for candidate in inputList[position:]:
            if not compare_fn(reference, candidate):
                survivors.append(candidate)
        inputList[position:] = survivors
def processFiles(filenames):
    """
    Extract copyright notices and license texts from the given files and
    merge near-duplicates.

    Each file is decoded as UTF-8 explicitly, so the outcome does not
    depend on the platform's locale encoding. A file that cannot be
    decoded is assumed not to be text and is skipped.

    :param filenames: Iterable of paths to read.
    :return: Tuple (noticeList, licenseList) with duplicates removed
        according to simpleStringMatch.
    :raises OSError: If a file cannot be opened or read.
    """
    noticeList = []
    licenseList = []
    for name in filenames:
        try:
            with open(name, encoding="utf-8") as file:
                fileText = file.read()
        except UnicodeDecodeError:
            # Probably not a text file, so let's ignore it.
            continue

        filetype = getFiletype(name)
        fileHeaderBlocks = findHeaders(fileText, filetype)
        fileNotices, fileLicenses = splitHeaders(fileHeaderBlocks)
        noticeList.extend(fileNotices)
        licenseList.extend(fileLicenses)

    mergeDuplicatesInPlace(noticeList, simpleStringMatch)
    mergeDuplicatesInPlace(licenseList, simpleStringMatch)
    return (noticeList, licenseList)
def printResults(noticeList, licenseList):
    """
    Print the merged notices and licenses to standard output.

    Notices are printed one per line under a "Copyright Notices" banner;
    license texts follow under a "Licenses" banner, separated from each
    other by a line containing "###".
    """
    noticeText = "\n".join(noticeList)
    licenseText = "\n###\n".join(licenseList)
    report = "===== Copyright Notices =====\n{}\n\n===== Licenses =====\n{}".format(
        noticeText, licenseText)
    print(report)
def main(args):
    """
    Command-line entry point: treat every argument after the program
    name as a file to scan, then print the merged results.

    :param args: Argument vector; args[0] is ignored.
    """
    filenames = args[1:]
    notices, licenses = processFiles(filenames)
    printResults(notices, licenses)
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment