License/authorship handling scripts

2020-09-17 15:55:15 -03:00 · 2020-09-17 15:55:15 -03:00 · c33040f0cf
commit c33040f0cf
parent b9d4bd0854
5 changed files with 478 additions and 0 deletions
--- a/tools/licensing/README.md
+++ b/tools/licensing/README.md
@ -0,0 +1,57 @@
+# Licensing Check Scripts
+
+This directory holds various scripts to aid in clearing IP on files. The main script is
+`log2json.sh`, which should receive a path to a file (relative to the repository root). It will retrieve
+the file history and build a JSON output with all metadata needed for analysis. The second
+script is `check.py` which receives a JSON file generated from the previous tool (either from a file
+or from stdin, using `-` for the filename).
+
+The check script will:
+
+  1. retrieve git commit authors
+  2. parse commit message for possible attributions ("authored by: ...", among other variations)
+  3. retrieve file contents at each commit, parse the license header and try to extract authors
+     and companies (copyrights) listed there
+
+Steps 2 and 3 are based on heuristics. The attributions may not match the regular expressions
+used so there may be misdetections. Authors on headers are easier to detect. In fact, this will
+pick up various false positives (non-author strings) which will have to be ignored by the user.
+
+All of this authorship information is aggregated and, in a final step, the names are used
+to check for ICLAs, based on the ICLA databases (see below), which need to be manually downloaded.
+If a given author name is not matched, their email is searched for in the `author_mappings.json` file,
+which is a dictionary of email to real name. This allows handling users with alternative email
+addresses.
+
+The script output will report a green check if the author matched the ICLA database or a red cross
+if not. Note that given the false positives in steps 2 and 3, there may be non-author strings
+that obviously do not match, and there may also be an attribution which was not detected in a commit
+message. The thorough approach is to run the check script with verbosity (`-v`), which will
+print the metadata of each commit, including the commit message. If double verbosity is used (`-vv`),
+the license headers found in the file at each commit will be printed as well, which allows checking them.
+
+## Inaccessible blobs
+
+Since some files in the repositories lived during some part of their history in a separate repository
+(linked as a submodule to main repo), their blobs (basically the file at a given point in time)
+will not be accessible. This means that the file at that point in time cannot be accessed for analyzing
+its header.
+
+## Zero blob hash
+
+Some blob hashes will be all zeros, which means that the file was deleted at this point in time.
+Sometimes this is due to merges or renames (which may be part of the moving in and out of submodules).
+
+## ICLA database
+
+In order to retrieve a list of all users with CLAs,
+download the following files:
+
+  * https://whimsy.apache.org/public/icla-info.json
+  * https://whimsy.apache.org/public/icla-info_noid.json
+
+There are two files since not all users with CLAs have
+Apache IDs. These lists do not contain emails, but a
+manual search form is also here:
+
+  * https://whimsy.apache.org/roster/committer/
--- a/tools/licensing/apachize.py
+++ b/tools/licensing/apachize.py
@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+
+import re
+import os
+import sys
+
+# Apache 2.0 license header template. The literal token PATH on its second
+# line is replaced by apachize() with the path of the processed file relative
+# to TOPDIR. Note that the string starts with a line break (the opening quotes
+# are immediately followed by a newline).
+apache = r"""
+/****************************************************************************
+ * PATH
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/"""
+
+def apachize(path, header):
+    """Print the file at `path` with its license header replaced by `header`.
+
+    The token PATH inside `header` is replaced by `path` made relative to the
+    directory named by the TOPDIR environment variable. The first comment in
+    the file that starts with '/***', contains the word "Copyright" (matched
+    case-insensitively) and ends with '***/' is then substituted by the
+    resulting header, and the whole file is printed to stdout.
+
+    Raises KeyError if TOPDIR is not defined and OSError if the file cannot
+    be read.
+    """
+    relpath = os.path.relpath(path, os.environ['TOPDIR'])
+
+    # Plain string replacement: the path is data, not a regex template, so
+    # backslashes in it must not be interpreted as escapes
+    header = header.replace('PATH', relpath)
+
+    with open(path) as f:
+        s = f.read()
+
+    # A callable replacement inserts the header verbatim (no escape
+    # processing). count/flags are passed by keyword since passing them
+    # positionally is deprecated.
+    s = re.sub(r'(?i)/\*\*\*.+?(?:Copyright).+?\*\*\*+/', lambda m: header,
+               s, count=1, flags=re.DOTALL)
+    print(s)
+
+# Command-line entry point: expects exactly one argument (the file to
+# process) and the TOPDIR environment variable, which apachize() reads
+
+if len(sys.argv) != 2:
+    print("Usage: ./apachize.py <file>", file=sys.stderr)
+    print("This will replace the license header of the passed file to that of Apache 2.0 and print it to stdout", file=sys.stderr)
+    sys.exit(2)
+
+if 'TOPDIR' not in os.environ:
+    print("Please define the TOPDIR environment variable to the full path to nuttx/ root", file=sys.stderr)
+    sys.exit(2)
+
+apachize(sys.argv[1], apache)
--- a/tools/licensing/author_mappings.json
+++ b/tools/licensing/author_mappings.json
@ -0,0 +1,11 @@
+{
+	"matias@imap.cc": "Matias Nitsche",
+	"patacongo@42af7a65-404d-4744-a932-0658087f49c3": "Gregory Nutt",
+	"spudaneco@gmail.com": "Gregory Nutt",
+	"gnutt@nuttx.org": "Gregory Nutt",
+	"gnutt@linux-qpx1.site": "Gregory Nutt",
+	"spudarnia@yahoo.com": "Gregory Nutt",
+	"59230071+hartmannathan@users.noreply.github.com": "Nathan Hartman",
+	"abdelatif.guettouche@gmail.com": "Abdelatif Guettouche",
+	"acassis@gmail.com": "Alan Carvalho de Assis"
+}
--- a/tools/licensing/check.py
+++ b/tools/licensing/check.py
@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+
+############################################################################
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.  The
+# ASF licenses this file to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance with the
+# License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+############################################################################
+
+import json
+import sys
+import re
+import subprocess
+import getopt
+import termcolor
+import os
+
+# ICLA databases and the email -> real name mapping. All three are loaded
+# from JSON files by the script body at the end of this file.
+
+committers_json = None
+non_committers_json = None
+author_mappings_json = None
+
+# Verbosity (incremented once per '-v' option) and color switch (cleared
+# by the '-n' option)
+
+verbose_level = 0
+color = True
+
+# Returns s wrapped in terminal color c through termcolor, or s itself
+# when color output was disabled
+
+def colored(s,c):
+    return termcolor.colored(s,c) if color else s
+
+# Collects possible attributions from commit c: the rest of the line after
+# "by", "from", "author" or "Co-authored-by" (case-insensitive, optionally
+# followed by a colon) and at least one space. The commit 'message' is
+# scanned first and then its 'body'; matches are returned in that order.
+
+def commit_attributions(c):
+    pattern = re.compile('(?i)(?:by|from|author|Co-authored-by):? +(.+)')
+    found = pattern.findall(c['message'])
+    found.extend(pattern.findall(c['body']))
+    return found
+
+# Returns every comment in s that starts with '/***', contains the word
+# "Copyright" (case-insensitive) and ends with '***/'. The matches may span
+# several lines.
+
+def get_headers(s):
+    # Raw string: '\*' is not a valid Python string escape and produces a
+    # warning when written inside a regular string literal
+    return re.findall(r'(?i)/\*\*\*.+?(?:Copyright).+?\*\*\*+/', s, re.DOTALL)
+
+# Returns the contents of git object 'blob' as text (by running
+# 'git cat-file -p'), or None when git exits with an error
+
+def get_file(blob):
+    try:
+        raw = subprocess.check_output(['git','cat-file','-p',blob], stderr=subprocess.DEVNULL)
+    except subprocess.CalledProcessError:
+        return None
+
+    # The blob is not guaranteed to be valid UTF-8: replace undecodable
+    # bytes instead of aborting the whole analysis with UnicodeDecodeError
+    return raw.decode(errors='replace')
+
+# Extracts author entries from a license header: takes the text between
+# "Author:"/"Authors:" and the word "Redistribution", splits it on line
+# breaks (dropping the leading spaces/asterisks of each continuation line),
+# removes an embedded "Copyright <year>" prefix and discards empty entries
+
+def header_authors(header):
+    entries = []
+    for block in re.findall('[Aa]uthors?: +(.+?) *(?:Redistribution)', header, re.DOTALL):
+        for line in re.split('\n[ *]+', block):
+            line = re.sub('[Cc]opyright:?( ?.[Cc].)? *([12][0-9]{3}[,-]? ?)','',line)
+            if line != '':
+                entries.append(line)
+    return entries
+
+# Look up an author name in Apache's committers/non-committers
+# databases. Returns (apacheID,name) on a match or None otherwise.
+# apacheID is None when the name is only listed among the
+# non-committers (no Apache ID for that author)
+
+def search_for_cla(name):
+    committer = next(((k,v) for k,v in committers_json['committers'].items() if v == name), None)
+    if committer is not None:
+        return committer
+
+    if name in non_committers_json['non_committers']:
+        return (None,name)
+
+    return None
+
+# Returns the same as search_for_cla(), but this takes an author
+# string which may include an email ("Name <email>" or "Name email").
+# When the name itself is not matched, the email is used to look up
+# an alternative name for this person in the author mappings.
+# Returns None if the string contains an '@' but no email can be
+# extracted from it.
+
+def author_has_cla(author):
+    if ('@' in author):
+        matches = re.match('^(.+?)(?: +([^ ]+@[^ ]+ *))$', author)
+        if (not matches):
+            return None # found an '@' but it wasn't an email, so this is most likely not really an author
+
+        # The email group may capture trailing spaces: strip them first,
+        # otherwise the closing '>' is not at the end and is left in place
+        name = matches.group(1).strip()
+        email = matches.group(2).strip().lstrip('<').rstrip('>')
+    else:
+        name = author.strip()
+        email = None
+
+    vvvprint('name: %s email: %s' % (name,email if email else '?'))
+
+    # first look for name directly
+
+    result = search_for_cla(name)
+    if (result):
+        return result
+
+    # otherwise, look for the alternative name registered for
+    # the email of this author (if any)
+
+    if (email and (email in author_mappings_json)):
+        result = search_for_cla(author_mappings_json[email])
+        if (result):
+            return result
+
+    # Nothing matched
+
+    return None
+
+# Returns the copyright holders of a license header: for each
+# " * Copyright [(C)] [years] <holder>" line, the <holder> text with any
+# "All rights reserved" suffix removed
+
+def header_copyrights(header):
+    # Raw string: '\*' is not a valid Python string escape and produces a
+    # warning when written inside a regular string literal
+    results = re.findall(r' \* *[Cc]opyright:?(?: ?.[Cc].)? *(?:[12][0-9]{3}[,-]? ?)* *(.+)', header)
+    return [re.sub('(. )?[Aa]ll rights reserved.?','',result) for result in results]
+
+# Prints one report line for 'author': a green check when a CLA was found
+# or a red cross otherwise, followed by the author string (and the Apache
+# ID, when there is one)
+
+def report_cla(author):
+    cla = author_has_cla(author)
+    if cla:
+        (apacheid,name) = cla
+        mark = colored('✓','green')
+    else:
+        apacheid = None
+        mark = colored('✗','red')
+
+    print(mark, end = ' ')
+    print('%s (ID: %s)' % (author, apacheid) if apacheid else author)
+
+# Prints the authorship report for a file history. 'j' is the list of
+# commits loaded from the JSON input; each entry is read for the keys
+# 'commit', 'blob', 'date', 'author', 'author-email', 'message' and 'body'.
+#
+# For every commit this collects the attributions found in the commit
+# message, the git author and, when the blob can be read, the authors and
+# copyrights found in the license headers of the file. Per-commit details
+# are only printed in verbose mode; the aggregated report (with the CLA
+# check of each attribution/author) is always printed.
+
+def analyze(j):
+    complete_attributions = set()
+    complete_authors = set()
+    complete_copyrights = set()
+
+    vprint('file has %i commits' % len(j))
+    for commit in j:
+        vprint(colored('-','yellow'))
+        vprint(colored('commit: ', 'green') + commit['commit'])
+        vprint(colored('blob: ', 'green') + commit['blob'])
+        vprint(colored('date: ','green') + commit['date'])
+        vprint(colored('author: ','green') + ('%s <%s>' % (commit['author'], commit['author-email'])))
+
+        attributions = commit_attributions(commit)
+        if attributions:
+            vprint(colored('attributions:','green'))
+            for attribution in attributions:
+                vprint(attribution)
+
+        complete_attributions |= set(attributions)
+        complete_authors.add(commit['author'] + ' ' + commit['author-email'])
+
+        # skip deletion commits
+
+        vprint(colored('blob:','green'), end = ' ')
+        if (commit['blob'] == '0000000000000000000000000000000000000000'):
+            vprint('zero (deletion)')
+            continue
+
+        file_contents = get_file(commit['blob'])
+
+        # skip inaccessible blobs (probably lived in a submodule). Only None
+        # means the blob could not be read: an empty file is still available
+
+        if file_contents is None:
+            vprint('inaccessible')
+            continue
+
+        vprint('available')
+
+        headers = get_headers(file_contents)
+
+        vprint(colored('header authors:','green'))
+        authors = set()
+        for header in headers:
+            ha = header_authors(header)
+            authors |= set(ha)
+            vprint(ha)
+
+        complete_authors |= authors
+
+        vprint(colored('header copyrights:','green'))
+        copyrights = set()
+        for header in headers:
+            hc = header_copyrights(header)
+            copyrights |= set(hc)
+            vprint(hc)
+
+        vprint(colored('commit description:','green'))
+        vprint(commit['message'])
+
+        if commit['body']:
+            vprint(colored('commit msg body:','green'))
+            vprint(commit['body'])
+
+        vvprint(colored('headers:','green'))
+        for header in headers:
+            vvprint(header)
+
+        complete_copyrights |= copyrights
+
+    vprint(colored("----\n",'yellow'))
+
+    # The aggregated names live in sets, whose iteration order changes from
+    # run to run: sort them so that the report is reproducible
+
+    print(colored("COMPLETE REPORT:",'blue'))
+    print(colored("attributions:",'green'))
+    if not complete_attributions:
+        print("*none detected*")
+    else:
+        for attribution in sorted(complete_attributions):
+            report_cla(attribution)
+
+    print(colored("authors:",'green'))
+    for author in sorted(complete_authors):
+        report_cla(author)
+
+    print(colored("copyrights:",'green'))
+    print('\n'.join(sorted(complete_copyrights)))
+
+# Prints the command-line usage summary to stdout
+
+def print_help():
+    usage = "Usage: check.py [-v] [-n] <JSON file>\n"
+    options = ("  -v\tIncrease verbosity (add up to three times)\n"
+               "  -n\tDo not use color for output")
+    print(usage)
+    print(options)
+
+# print() wrapper which only outputs when verbosity is at least 1 ('-v')
+
+def vprint(*args, **kwargs):
+    if verbose_level <= 0:
+        return
+    print(*args, **kwargs)
+
+# print() wrapper which only outputs when verbosity is at least 2 ('-vv')
+
+def vvprint(*args, **kwargs):
+    if verbose_level <= 1:
+        return
+    print(*args, **kwargs)
+
+# print() wrapper which only outputs when verbosity is at least 3 ('-vvv')
+
+def vvvprint(*args, **kwargs):
+    if verbose_level <= 2:
+        return
+    print(*args, **kwargs)
+
+#####
+
+# Script body: load the databases, parse the command line and run the
+# analysis on the given JSON input.
+
+# All JSON databases are looked up in the directory holding this script
+
+script_dir = os.path.dirname(os.path.abspath(__file__))
+
+# First try to load the CLAs JSONs:
+
+try:
+    with open(script_dir + '/icla-info.json','r') as file:
+        committers_json = json.load(file)
+
+    with open(script_dir + '/icla-info_noid.json','r') as file:
+        non_committers_json = json.load(file)
+except (OSError, ValueError):
+    # OSError: file missing or unreadable. ValueError: contents are not
+    # valid JSON (json.JSONDecodeError is a subclass of ValueError).
+    # Anything else (e.g. Ctrl-C) is deliberately not swallowed here.
+    print('Could not open CLA JSON files, please read README.md for download instructions')
+    sys.exit(2)
+
+# Open author mappings JSON
+
+with open(script_dir + '/author_mappings.json','r') as file:
+    author_mappings_json = json.load(file)
+
+# Parse options: -h (help), -v (more verbose, repeatable), -n (no color)
+
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "hnv")
+except getopt.GetoptError:
+    print_help()
+    sys.exit(2)
+for opt, arg in opts:
+    if (opt == "-h"):
+        print_help()
+        sys.exit()
+    elif opt == "-v":
+        verbose_level = verbose_level + 1
+    elif opt == "-n":
+        color = False
+
+# Exactly one positional argument is expected: the JSON file name
+
+if (len(args) != 1):
+    print_help()
+    sys.exit(2)
+
+f = args[0]
+
+if not f:
+    print_help()
+    sys.exit(2)
+
+# '-' selects stdin, which allows piping the output of log2json.sh
+
+if (f == '-'):
+    j = json.load(sys.stdin)
+else:
+    with open(f, 'r') as file:
+        j = json.load(file)
+
+analyze(j)
--- a/tools/licensing/log2json.sh
+++ b/tools/licensing/log2json.sh
@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+############################################################################
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.  The
+# ASF licenses this file to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance with the
+# License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+############################################################################
+
+# We define a replacement for the quote character (") since we cannot
+# escape quote characters found inside the commit message: git emits
+# this placeholder wherever the JSON syntax needs a quote, the real
+# quotes coming from the commit data are escaped by the getlog pipeline
+# and only afterwards the placeholder is turned into a quote
+
+Q='^@^'
+
+# Prints the history of file $1 as a JSON array with one object per commit
+# (hash, author, committer, date, subject, body and signature fields).
+# The format string uses $Q instead of quotes; the sed/tr pipeline then
+# escapes the commit data and finally turns $Q into real quotes.
+
+function getlog
+{
+  (echo -n '['
+   git --no-pager log --follow --simplify-merges \
+     --pretty=format:'{ '$Q'commit'$Q': '$Q'%H'$Q', '$Q'author'$Q': '$Q'%aN'$Q', '$Q'author-email'$Q': '$Q'%aE'$Q', '$Q'date'$Q': '$Q'%ad'$Q', '$Q'committer'$Q': '$Q'%cn'$Q', '$Q'committer-email'$Q': '$Q'%ce'$Q', '$Q'message'$Q': '$Q'%s'$Q', '$Q'body'$Q': '$Q'%b'$Q', '$Q'signed'$Q': '$Q'%G?'$Q', '$Q'signer'$Q': '$Q'%GS'$Q', '$Q'key'$Q': '$Q'%GK'$Q' },' -- "$1"
+  echo -n ']') | 
+    sed -r 's|\\|\\\\|g' |      # escape backslashes
+    sed -r 's|"|\\"|g' |        # replace quotes with escaped quotes
+    tr '\r\n' ' ' |             # replace newlines with spaces (otherwise strings are broken)
+    sed -r 's|\}, \{|},\n{|g' | # add newlines between entries, for readability
+    sed 's|\},\]|}]|g' |        # remove dangling comma at the final array entry
+    sed -r "s|\\^@\\^|\"|g"     # replace $Q with "
+}
+
+# Prints a JSON array with one { "blob", "path" } object per raw entry in
+# the history of file $1. Fields 4 and 6 of each raw line are kept (after
+# turning tabs into spaces); in git's raw diff format these should be the
+# post-change blob hash and the path -- NOTE(review): confirm against the
+# git raw output format documentation.
+
+function getblobs
+{
+  echo -n '[';
+  git --no-pager log --pretty='' --no-abbrev --raw --follow --simplify-merges -- "$1" |
+    tr '\t' ' ' |
+    cut -f 4,6 -d ' ' |
+    sed -r 's|^(\S+) (\S+)$|{ "blob": "\1", "path": "\2" },|g' |
+    sed -r '$s|,$||g'  # remove dangling comma at the final array entry
+  echo -n ']'
+}
+
+# Script body: build the commit log and the blob list of file $1 in two
+# temporary files and merge them, entry by entry, into the JSON printed
+# on stdout. Diagnostics go to stderr since stdout carries the JSON which
+# is meant to be fed to check.py.
+
+if [ "$1" = "" ]; then
+	echo "usage: $(basename "$0") <file>" >&2
+	exit 1
+fi
+
+out1=$(mktemp)
+out2=$(mktemp)
+
+getlog "$1" > "$out1"
+getblobs "$1" > "$out2"
+
+if [ "$(jq '. | length' "$out1")" != "$(jq '. | length' "$out2")" ]; then
+	# TODO: handle this case, we get more than one blob when the file is moved
+	# to/from a submodule, which complicates parsing. Also, the blob we get
+	# for when the file is in the submodule is not usable since it is possibly
+	# from the pointed repo.
+	# The temporary files are intentionally kept for inspection.
+	echo "Log and blob list differ in size, probably a file which lived in a submodule" >&2
+	echo "Log output is at: $out1" >&2
+	echo "Blob output is at: $out2" >&2
+	exit 1
+fi
+
+# Pair the Nth log entry with the Nth blob entry and merge both objects
+
+jq -s 'transpose | map(.[0] + .[1])' "$out1" "$out2"
+
+rm -f "$out1" "$out2"