Make CheckLicense ~60x faster

After optimizing CheckForIncludeGuards the CheckLicense presubmit was
the second slowest presubmit, taking about 35 minutes when run as part
of "git cl presubmit --all". The cause of its slowness was initially
non-obvious because it runs quite quickly on most files, however
analysis showed that it could take 50+ seconds on some files. The files
that it is slow on are those that lack a good license header, meaning
that the regex match has to painstakingly scan the entire file.

The optimization in this change is to recognize that there is a simple
non-regex line that appears in all valid license headers, regardless of
variants. If that line is absent then there is, necessarily, no valid
license header, and searching for a line of text is something that
Python can do extremely quickly.

This change drops the CheckLicense time from about 35 minutes to about
32 seconds.

Trivia: _CommonChecks in third_party/blink/PRESUBMIT.py passes .* as the
license, so I added an early-out for that to avoid pointlessly scanning
those files.

Bug: 1309977
Change-Id: Ic2e56079675c2c5a2643d20dd492b1cc52e4ead2
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/3584882
Reviewed-by: Erik Staab <estaab@chromium.org>
Reviewed-by: Gavin Mak <gavinmak@google.com>
Commit-Queue: Bruce Dawson <brucedawson@chromium.org>
changes/82/3584882/7
Bruce Dawson 3 years ago committed by LUCI CQ
parent 3b97fa826e
commit 07bfa0dd65

@ -595,26 +595,38 @@ def CheckLicense(input_api, output_api, license_re=None, project_name=None,
"""Verifies the license header. """Verifies the license header.
""" """
project_name = project_name or 'Chromium' # Early-out if the license_re is guaranteed to match everything.
if license_re and license_re == '.*':
# Accept any year number from 2006 to the current year, or the special return []
# 2006-20xx string used on the oldest files. 2006-20xx is deprecated, but
# tolerated on old files. key_line = None
current_year = int(input_api.time.strftime('%Y')) if not license_re:
allowed_years = (str(s) for s in reversed(range(2006, current_year + 1))) project_name = project_name or 'Chromium'
years_re = '(' + '|'.join(allowed_years) + '|2006-2008|2006-2009|2006-2010)'
# Accept any year number from 2006 to the current year, or the special
# The (c) is deprecated, but tolerate it until it's removed from all files. # 2006-20xx string used on the oldest files. 2006-20xx is deprecated, but
license_re = license_re or ( # tolerated on old files.
r'.*? Copyright (\(c\) )?%(year)s The %(project)s Authors\. ' current_year = int(input_api.time.strftime('%Y'))
r'All rights reserved\.\r?\n' allowed_years = (str(s) for s in reversed(range(2006, current_year + 1)))
r'.*? Use of this source code is governed by a BSD-style license that ' years_re = '(' + '|'.join(allowed_years) + '|2006-2008|2006-2009|2006-2010)'
r'can be\r?\n'
r'.*? found in the LICENSE file\.(?: \*/)?\r?\n' # A file that lacks this line necessarily lacks a compatible license.
) % { # Checking for this line lets us avoid the cost of a complex regex across a
'year': years_re, # possibly large file. This has been seen to save 50+ seconds on a single
'project': project_name, # file.
} key_line = ('Use of this source code is governed by a BSD-style license '
'that can be')
# The (c) is deprecated, but tolerate it until it's removed from all files.
license_re = (
r'.*? Copyright (\(c\) )?%(year)s The %(project)s Authors\. '
r'All rights reserved\.\r?\n'
r'.*? %(key_line)s\r?\n'
r'.*? found in the LICENSE file\.(?: \*/)?\r?\n'
) % {
'year': years_re,
'project': project_name,
'key_line' : key_line,
}
license_re = input_api.re.compile(license_re, input_api.re.MULTILINE) license_re = input_api.re.compile(license_re, input_api.re.MULTILINE)
bad_files = [] bad_files = []
@ -622,7 +634,10 @@ def CheckLicense(input_api, output_api, license_re=None, project_name=None,
contents = input_api.ReadFile(f, 'r') contents = input_api.ReadFile(f, 'r')
if accept_empty_files and not contents: if accept_empty_files and not contents:
continue continue
if not license_re.search(contents): # Search for key_line first to avoid fruitless and expensive regex searches.
if (key_line and not key_line in contents):
bad_files.append(f.LocalPath())
elif not license_re.search(contents):
bad_files.append(f.LocalPath()) bad_files.append(f.LocalPath())
if bad_files: if bad_files:
return [output_api.PresubmitPromptWarning( return [output_api.PresubmitPromptWarning(

Loading…
Cancel
Save