diff --git a/tools/licensing/README.md b/tools/licensing/README.md new file mode 100644 index 0000000000..76066b6a9e --- /dev/null +++ b/tools/licensing/README.md @@ -0,0 +1,57 @@ +# Licensing Check Scripts + +This directory holds various scripts to aid in clearing IP on files. The main script is +`log2json.sh` which should receive a path to a file (relative to repository root). It will retrieve +the file history and build a JSON output with all metadata needed for analysis. The second +script is `check.py` which receives a JSON file generated from the previous tool (either from a file +or from stdin, using `-` for the filename). + +The check script will: + + 1. retrieve git commit authors + 2. parse commit message for possible attributions ("authored by: ...", among other variations) + 3. retrieve file contents at each commit, parse the license header and try to extract authors + and companies (copyrights) listed there + +Steps 2 and 3 are based on heuristics. The attributions may not match the regular expressions +used so there may be misdetections. Authors on headers are easier to detect. In fact, this will +pick up various false positives (non-author strings) which will have to be ignored by the user. + +All of this authorship information is aggregated and in a final step, the names are used +to check for ICLAs, based on the ICLA databases (see below), which need to be manually downloaded. +If a given author name is not matched, their email is searched for in the `author_mappings.json` file, +which is a dictionary of email to real name. This allows handling users with alternative email +addresses. + +The script output will report a green check if author matched the ICLA database or a red cross +if not. Note that given the false positives in steps 2 and 3, there may be both non-author strings +that obviously do not match and also there may be an attribution which was not detected in a commit +message. 
#!/usr/bin/env python3

"""Replace the license header of a source file with the Apache 2.0 header.

The rewritten file is printed to stdout; the file on disk is not modified.
The TOPDIR environment variable must hold the full path to the nuttx/ root,
because the header embeds the file path relative to that directory.
"""

import os
import re
import sys

# Apache 2.0 header template. 'PATH' is substituted with the path of the file
# relative to TOPDIR. The template starts directly at the comment opener so
# that substituting it does not push a blank line in front of the header.
apache = r"""/****************************************************************************
 * PATH
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership. The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/"""

# A license header: a '/***' comment that mentions "copyright" (in any case)
# and runs up to the first closing '***/' after it.
HEADER_RE = re.compile(r'/\*\*\*.+?(?:Copyright).+?\*\*\*+/',
                       re.IGNORECASE | re.DOTALL)


def replace_header(source, relpath, header):
    """Return *source* with its first license header replaced by *header*.

    Every occurrence of 'PATH' in *header* is replaced by *relpath*. Both
    substitutions insert their text literally, so backslashes in a path are
    not interpreted as regex replacement escapes. *source* is returned
    unchanged when it contains no recognizable header.
    """
    new_header = header.replace('PATH', relpath)
    # A callable replacement is inserted verbatim (no template expansion).
    return HEADER_RE.sub(lambda _match: new_header, source, count=1)


def apachize(path, header):
    """Print the contents of *path* with its license header set to *header*.

    The path written into the header is *path* relative to the directory
    named by the TOPDIR environment variable (KeyError if it is not set).
    """
    relpath = os.path.relpath(path, os.environ['TOPDIR'])

    with open(path) as f:
        s = f.read()

    # end='' keeps the output identical to the file apart from the header;
    # a plain print() would append an extra newline.
    print(replace_header(s, relpath, header), end='')


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    argv = sys.argv if argv is None else argv

    if len(argv) != 2:
        print("Usage: ./apachize.py <file>", file=sys.stderr)
        print("This will replace the license header of the passed file to that of Apache 2.0 and print it to stdout", file=sys.stderr)
        return 2

    if 'TOPDIR' not in os.environ:
        print("Please define the TOPDIR environment variable to the full path to nuttx/ root", file=sys.stderr)
        return 2

    apachize(argv[1], apache)
    return 0


if __name__ == '__main__':
    sys.exit(main())
#!/usr/bin/env python3

############################################################################
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership. The
# ASF licenses this file to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance with the
# License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
############################################################################

"""Check the authors found in a file's history against the ICLA databases.

The input is the JSON produced by log2json.sh (a list of commits, each with
its blob hash). Authors are collected from git metadata, from attributions
in commit messages and from the license headers of each file revision, and
every name is then looked up in the CLA lists stored next to this script.
"""

import getopt
import json
import os
import re
import subprocess
import sys

try:
    import termcolor
except ImportError:
    # Colour is purely cosmetic: fall back to plain text when termcolor is
    # not installed instead of refusing to run.
    termcolor = None

# Databases, populated by main() before analyze() is called.
committers_json = None
non_committers_json = None
author_mappings_json = None

verbose_level = 0
color = True

# Blob hash reported for a commit in which the file was deleted.
ZERO_BLOB = '0000000000000000000000000000000000000000'

# All patterns are raw strings: '\*' in a normal string literal is an invalid
# escape sequence.
ATTRIBUTION_RE = re.compile(r'(?:by|from|author|Co-authored-by):? +(.+)',
                            re.IGNORECASE)
HEADER_RE = re.compile(r'/\*\*\*.+?(?:Copyright).+?\*\*\*+/',
                       re.IGNORECASE | re.DOTALL)
HEADER_AUTHORS_RE = re.compile(r'[Aa]uthors?: +(.+?) *(?:Redistribution)',
                               re.DOTALL)
HEADER_LINE_SPLIT_RE = re.compile(r'\n[ *]+')
COPYRIGHT_PREFIX_RE = re.compile(
    r'[Cc]opyright:?( ?.[Cc].)? *([12][0-9]{3}[,-]? ?)')
HEADER_COPYRIGHT_RE = re.compile(
    r' \* *[Cc]opyright:?(?: ?.[Cc].)? *(?:[12][0-9]{3}[,-]? ?)* *(.+)')
RIGHTS_RESERVED_RE = re.compile(r'(. )?[Aa]ll rights reserved.?')
AUTHOR_EMAIL_RE = re.compile(r'^(.+?)(?: +([^ ]+@[^ ]+ *))$')


def colored(s, c):
    """Return *s* in colour *c*, or unchanged when colour is unavailable."""
    if color and termcolor is not None:
        return termcolor.colored(s, c)
    return s


def commit_attributions(c):
    """Return possible attributions found in a commit's subject and body.

    This is a heuristic: anything following "by", "from", "author" or
    "Co-authored-by" (optionally with a colon) is reported, so the result
    can contain strings that are not authors.
    """
    return ATTRIBUTION_RE.findall(c['message']) + ATTRIBUTION_RE.findall(c['body'])


def get_headers(s):
    """Return every '/*** ... Copyright ... ***/' comment block found in *s*."""
    return HEADER_RE.findall(s)


def get_file(blob):
    """Return the contents of git blob *blob*, or None if git cannot read it.

    Bytes that are not valid UTF-8 are replaced rather than raising, so that
    a single oddly encoded revision does not abort the whole analysis.
    """
    try:
        raw = subprocess.check_output(['git', 'cat-file', '-p', blob],
                                      stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        return None
    return raw.decode(errors='replace')


def header_authors(header):
    """Return the author lines listed in a license header.

    The text between "Author(s):" and "Redistribution" is split into lines,
    the comment decoration and any leading "Copyright <year>" are removed
    and empty entries are dropped.
    """
    authors = []
    for found in HEADER_AUTHORS_RE.findall(header):
        for line in HEADER_LINE_SPLIT_RE.split(found):
            line = COPYRIGHT_PREFIX_RE.sub('', line)
            if line != '':
                authors.append(line)
    return authors


# Search for an author name in Apache's committers/non-committers
# database. It will return (apacheID,name) if there's a match or
# None if not. apacheID might be None if there's no Apache ID
# for author

def search_for_cla(name):
    """Look *name* up in the CLA databases loaded by main()."""
    for k, v in committers_json['committers'].items():
        if v == name:
            return (k, v)

    if name in non_committers_json['non_committers']:
        return (None, name)

    return None


def split_author(author):
    """Split an author string into (name, email).

    *email* is None when *author* contains no '@'. None is returned instead
    of a tuple when there is an '@' but the string does not look like
    "name email", which most likely means it is not really an author.
    Surrounding whitespace and the angle brackets around the email are
    removed.
    """
    if '@' not in author:
        return (author.strip(), None)

    matches = AUTHOR_EMAIL_RE.match(author)
    if not matches:
        return None

    name = matches.group(1).strip()
    # Strip whitespace first: the capture may end in spaces, which would
    # otherwise stop rstrip('>') from removing the closing bracket.
    email = matches.group(2).strip().lstrip('<').rstrip('>')
    return (name, email)


# Returns the same as search_for_cla(), but this takes an author
# (which may include an email, used to look for alternative
# author names for this person)

def author_has_cla(author):
    """Return (apacheID, name) if *author* has a CLA on record, else None."""
    parsed = split_author(author)
    if parsed is None:
        return None
    name, email = parsed

    vvvprint('name: %s email: %s' % (name, email if email else '?'))

    # first look for name directly

    result = search_for_cla(name)
    if result:
        return result

    # otherwise, try the alternative name registered for this email

    if email and (email in author_mappings_json):
        result = search_for_cla(author_mappings_json[email])
        if result:
            return result

    # Nothing matched

    return None


def header_copyrights(header):
    """Return the copyright holders named in a license header.

    The "Copyright (C) <years>" prefix and a trailing "All rights reserved"
    are removed from each entry.
    """
    return [RIGHTS_RESERVED_RE.sub('', found)
            for found in HEADER_COPYRIGHT_RE.findall(header)]


def report_cla(author):
    """Print *author* preceded by a check mark (CLA found) or a cross."""
    cla = author_has_cla(author)
    if cla:
        apacheid = cla[0]
        print(colored('✓', 'green'), end=' ')
    else:
        apacheid = None
        print(colored('✗', 'red'), end=' ')

    if apacheid:
        print('%s (ID: %s)' % (author, apacheid))
    else:
        print(author)


def analyze(j):
    """Analyze the commit list *j* (output of log2json.sh) and print a report.

    Per-commit details are only printed at verbosity 1 and above; the final
    report is always printed, in sorted order so that it is reproducible.
    """
    complete_attributions = set()
    complete_authors = set()
    complete_copyrights = set()

    vprint('file has %i commits' % len(j))
    for commit in j:
        vprint(colored('-', 'yellow'))
        vprint(colored('commit: ', 'green') + commit['commit'])
        vprint(colored('blob: ', 'green') + commit['blob'])
        vprint(colored('date: ', 'green') + commit['date'])
        vprint(colored('author: ', 'green') + ('%s <%s>' % (commit['author'], commit['author-email'])))

        attributions = commit_attributions(commit)
        if attributions:
            vprint(colored('attributions:', 'green'))
            for attribution in attributions:
                vprint(attribution)

        complete_attributions.update(attributions)
        complete_authors.add(commit['author'] + ' ' + commit['author-email'])

        # skip deletion commits

        vprint(colored('blob:', 'green'), end=' ')
        if commit['blob'] == ZERO_BLOB:
            vprint('zero (deletion)')
            continue

        file_contents = get_file(commit['blob'])

        # skip inaccessible blobs (probably lived in a submodule); an empty
        # file is still an accessible one, hence the explicit None test

        if file_contents is None:
            vprint('inaccessible')
            continue
        vprint('available')

        headers = get_headers(file_contents)

        vprint(colored('header authors:', 'green'))
        for header in headers:
            ha = header_authors(header)
            complete_authors.update(ha)
            vprint(ha)

        vprint(colored('header copyrights:', 'green'))
        for header in headers:
            hc = header_copyrights(header)
            complete_copyrights.update(hc)
            vprint(hc)

        vprint(colored('commit description:', 'green'))
        vprint(commit['message'])

        if commit['body']:
            vprint(colored('commit msg body:', 'green'))
            vprint(commit['body'])

        vvprint(colored('headers:', 'green'))
        for header in headers:
            vvprint(header)

    vprint(colored("----\n", 'yellow'))

    print(colored("COMPLETE REPORT:", 'blue'))
    print(colored("attributions:", 'green'))
    if not complete_attributions:
        print("*none detected*")
    else:
        for attribution in sorted(complete_attributions):
            report_cla(attribution)

    print(colored("authors:", 'green'))
    for author in sorted(complete_authors):
        report_cla(author)

    print(colored("copyrights:", 'green'))
    print('\n'.join(sorted(complete_copyrights)))


def print_help():
    """Print the command-line usage."""
    print("Usage: check.py [-h] [-v] [-n] <json-file | ->\n")
    print(" -h\tShow this help\n"
          " -v\tIncrease verbosity (add up to three times)\n"
          " -n\tDo not use color for output")


def vprint(*args, **kwargs):
    """print() only at verbosity level 1 or higher."""
    if verbose_level > 0:
        print(*args, **kwargs)


def vvprint(*args, **kwargs):
    """print() only at verbosity level 2 or higher."""
    if verbose_level > 1:
        print(*args, **kwargs)


def vvvprint(*args, **kwargs):
    """print() only at verbosity level 3 or higher."""
    if verbose_level > 2:
        print(*args, **kwargs)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Options are parsed before the databases are loaded so that -h and usage
    errors are reported even when the CLA files have not been downloaded.
    """
    global verbose_level, color
    global committers_json, non_committers_json, author_mappings_json

    if argv is None:
        argv = sys.argv[1:]

    try:
        opts, args = getopt.getopt(argv, "hnv")
    except getopt.GetoptError:
        print_help()
        return 2

    for opt, _ in opts:
        if opt == "-h":
            print_help()
            return 0
        elif opt == "-v":
            verbose_level = verbose_level + 1
        elif opt == "-n":
            color = False

    if len(args) != 1 or not args[0]:
        print_help()
        return 2

    f = args[0]
    here = os.path.dirname(os.path.abspath(__file__))

    # First try to load the CLAs JSONs. Only I/O and parse errors are
    # handled here; anything else (e.g. Ctrl-C) must propagate.

    try:
        with open(os.path.join(here, 'icla-info.json'), 'r', encoding='utf-8') as file:
            committers_json = json.load(file)

        with open(os.path.join(here, 'icla-info_noid.json'), 'r', encoding='utf-8') as file:
            non_committers_json = json.load(file)
    except (OSError, ValueError):
        print('Could not open CLA JSON files, please read README.md for download instructions')
        return 2

    # Open author mappings JSON

    with open(os.path.join(here, 'author_mappings.json'), 'r', encoding='utf-8') as file:
        author_mappings_json = json.load(file)

    if f == '-':
        j = json.load(sys.stdin)
    else:
        with open(f, 'r') as file:
            j = json.load(file)

    analyze(j)
    return 0


if __name__ == '__main__':
    sys.exit(main())
#
############################################################################

# Usage: log2json.sh <path>
#
# Prints to stdout a JSON array with one object per commit that touched
# <path> (followed across renames), each merged with the blob hash and path
# of the file at that commit. Diagnostics go to stderr so that they never
# end up in the JSON stream when the output is piped into check.py.

# We define a replacement for the quote character (")
# since we cannot escape quote characters found inside
# the commit message

Q='^@^'

# getlog <path>: print the commit metadata of <path> as a JSON array.

function getlog
{
  (echo -n '['
   git --no-pager log --follow --simplify-merges \
     --pretty=format:'{ '$Q'commit'$Q': '$Q'%H'$Q', '$Q'author'$Q': '$Q'%aN'$Q', '$Q'author-email'$Q': '$Q'%aE'$Q', '$Q'date'$Q': '$Q'%ad'$Q', '$Q'committer'$Q': '$Q'%cn'$Q', '$Q'committer-email'$Q': '$Q'%ce'$Q', '$Q'message'$Q': '$Q'%s'$Q', '$Q'body'$Q': '$Q'%b'$Q', '$Q'signed'$Q': '$Q'%G?'$Q', '$Q'signer'$Q': '$Q'%GS'$Q', '$Q'key'$Q': '$Q'%GK'$Q' },' -- "$1"
   echo -n ']') |
    sed -r 's|\\|\\\\|g' |          # escape backslashes
    sed -r 's|"|\\"|g' |            # replace quotes with escaped quotes
    tr '\r\n' ' ' |                 # replace newlines with spaces (otherwise strings are broken)
    sed -r 's|\}, \{|},\n{|g' |     # add newlines between entries, for readability
    sed 's|\},\]|}]|g' |            # remove dangling comma at the final array entry
    sed -r "s|\\^@\\^|\"|g"         # replace $Q with "
}

# getblobs <path>: print, as a JSON array, the blob hash and path of <path>
# for each commit listed by getlog (fields 4 and 6 of `git log --raw`).

function getblobs
{
  echo -n '[';
  git --no-pager log --pretty='' --no-abbrev --raw --follow --simplify-merges -- "$1" |
    tr '\t' ' ' |
    cut -f 4,6 -d ' ' |
    sed -r 's|^(\S+) (\S+)$|{ "blob": "\1", "path": "\2" },|g' |
    sed -r '$s|,$||g'               # remove dangling comma at the final array entry
  echo -n ']'
}

if [ "$1" = "" ]; then
  echo "usage: $(basename "$0") <path>" >&2
  exit 1
fi

out1=$(mktemp)
out2=$(mktemp)

getlog "$1" > "$out1"
getblobs "$1" > "$out2"

if [ "$(jq '. | length' "$out1")" != "$(jq '. | length' "$out2")" ]; then
  # TODO: handle this case, we get more than one blob when the file is moved
  # to/from a submodule and that complicates parsing. Also, the blob we get
  # for when the file is in the submodule is not usable since it is possibly
  # from the pointed repo.

  # The temporary files are deliberately kept so they can be inspected.

  echo "Log and blob list differ in size, probably a file which lived in a submodule" >&2
  echo "Log output is at: $out1" >&2
  echo "Blob output is at: $out2" >&2
  exit 1
fi

jq -s 'transpose | map(.[0] + .[1])' "$out1" "$out2"

rm -f "$out1" "$out2"