From 07bfa0dd6549d85af323bacbdcee84257a357fd8 Mon Sep 17 00:00:00 2001 From: Bruce Dawson Date: Wed, 13 Apr 2022 20:40:23 +0000 Subject: [PATCH] Make CheckLicense ~60x faster After optimizing CheckForIncludeGuards the CheckLicense presubmit was the second slowest presubmit, taking about 35 minutes when run as part of "git cl presubmit --all". The cause of its slowness was initially non-obvious because it runs quite quickly on most files, however analysis showed that it could take 50+ seconds on some files. The files that it is slow on are those that lack a good license header, meaning that the regex match has to painstakingly scan the entire file. The optimization in this change is to recognize that there is a simple non-regex line that appears in all valid license headers, regardless of variants. If that line is absent then there is, necessarily, no valid license header, and searching for a line of text is something that Python can do extremely quickly. This change drops the CheckLicense time from about 35 minutes to about 32 seconds. Trivia: _CommonChecks in third_party/blink/PRESUBMIT.py passes .* as the license, so I added an early-out for that to avoid pointlessly scanning those files. Bug: 1309977 Change-Id: Ic2e56079675c2c5a2643d20dd492b1cc52e4ead2 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/3584882 Reviewed-by: Erik Staab Reviewed-by: Gavin Mak Commit-Queue: Bruce Dawson --- presubmit_canned_checks.py | 57 ++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/presubmit_canned_checks.py b/presubmit_canned_checks.py index 63bba80e2..1d7499e25 100644 --- a/presubmit_canned_checks.py +++ b/presubmit_canned_checks.py @@ -595,26 +595,38 @@ def CheckLicense(input_api, output_api, license_re=None, project_name=None, """Verifies the license header. """ - project_name = project_name or 'Chromium' - - # Accept any year number from 2006 to the current year, or the special - # 2006-20xx string used on the oldest files. 2006-20xx is deprecated, but - # tolerated on old files. - current_year = int(input_api.time.strftime('%Y')) - allowed_years = (str(s) for s in reversed(range(2006, current_year + 1))) - years_re = '(' + '|'.join(allowed_years) + '|2006-2008|2006-2009|2006-2010)' - - # The (c) is deprecated, but tolerate it until it's removed from all files. - license_re = license_re or ( - r'.*? Copyright (\(c\) )?%(year)s The %(project)s Authors\. ' - r'All rights reserved\.\r?\n' - r'.*? Use of this source code is governed by a BSD-style license that ' - r'can be\r?\n' - r'.*? found in the LICENSE file\.(?: \*/)?\r?\n' - ) % { - 'year': years_re, - 'project': project_name, - } + # Early-out if the license_re is guaranteed to match everything. + if license_re and license_re == '.*': + return [] + + key_line = None + if not license_re: + project_name = project_name or 'Chromium' + + # Accept any year number from 2006 to the current year, or the special + # 2006-20xx string used on the oldest files. 2006-20xx is deprecated, but + # tolerated on old files. + current_year = int(input_api.time.strftime('%Y')) + allowed_years = (str(s) for s in reversed(range(2006, current_year + 1))) + years_re = '(' + '|'.join(allowed_years) + '|2006-2008|2006-2009|2006-2010)' + + # A file that lacks this line necessarily lacks a compatible license. + # Checking for this line lets us avoid the cost of a complex regex across a + # possibly large file. This has been seen to save 50+ seconds on a single + # file. + key_line = ('Use of this source code is governed by a BSD-style license ' + 'that can be') + # The (c) is deprecated, but tolerate it until it's removed from all files. + license_re = ( + r'.*? Copyright (\(c\) )?%(year)s The %(project)s Authors\. ' + r'All rights reserved\.\r?\n' + r'.*? %(key_line)s\r?\n' + r'.*? found in the LICENSE file\.(?: \*/)?\r?\n' + ) % { + 'year': years_re, + 'project': project_name, + 'key_line' : key_line, + } license_re = input_api.re.compile(license_re, input_api.re.MULTILINE) bad_files = [] @@ -622,7 +634,10 @@ def CheckLicense(input_api, output_api, license_re=None, project_name=None, contents = input_api.ReadFile(f, 'r') if accept_empty_files and not contents: continue - if not license_re.search(contents): + # Search for key_line first to avoid fruitless and expensive regex searches. + if (key_line and not key_line in contents): + bad_files.append(f.LocalPath()) + elif not license_re.search(contents): bad_files.append(f.LocalPath()) if bad_files: return [output_api.PresubmitPromptWarning(