From 107bca31f6907856ffd919209ce2262b12724a95 Mon Sep 17 00:00:00 2001 From: "primiano@chromium.org" Date: Tue, 5 Aug 2014 21:50:44 +0000 Subject: [PATCH] Add --no-history option to fetch and gclient for shallow clones. Many people* have complained on chromium-dev about the long times required to perform a full fetch over a DSL. This seems to be mostly due to the huge size of chromium's history (~9 GB). On the other hand, not everybody is interested in downloading the full git history of the projects. The size of git packs required to fetch a working HEAD is one order of magnitude smaller (1.5 GB). This change makes it possible to perform a shallow fetch (in a way which is consistent with DEPS, leveraging git templates on clone), reducing fetch times by 80% for those not interested in the history. * See: [chromium-dev] "fetch chromium" keeps hanging/getting stuck on Windows 7 [chromium-dev] Initial checkout with git taking long [chromium-dev] Trying to get latest source code fails when fetching [chromium-dev] Gclient sync takes too long BUG=228996 Review URL: https://codereview.chromium.org/437903002 git-svn-id: svn://svn.chromium.org/chrome/trunk/tools/depot_tools@287606 0039d316-1c4b-4281-b951-d872f2087c98 --- fetch.py | 12 ++++++++++- gclient.py | 3 +++ gclient_scm.py | 21 +++++++++++++++++++ gclient_utils.py | 5 +++++ tests/gclient_scm_test.py | 7 ++++--- tests/gclient_smoketest.py | 41 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 4 deletions(-) diff --git a/fetch.py b/fetch.py index d3bcacd79..897356f29 100755 --- a/fetch.py +++ b/fetch.py @@ -114,6 +114,8 @@ class GclientGitCheckout(GclientCheckout, GitCheckout): sync_cmd = ['sync'] if self.options.nohooks: sync_cmd.append('--nohooks') + if self.options.no_history: + sync_cmd.append('--no-history') if self.spec.get('with_branch_heads', False): sync_cmd.append('--with_branch_heads') self.run_gclient(*sync_cmd) @@ -207,6 +209,7 @@ Valid options: -h, --help, help Print this message. 
--nohooks Don't run hooks after checkout. -n, --dry-run Don't run commands, only print them. + --no-history Perform shallow clones, don't fetch the full git history. """ % os.path.basename(sys.argv[0])) sys.exit(bool(msg)) @@ -220,6 +223,7 @@ def handle_args(argv): dry_run = False nohooks = False + no_history = False while len(argv) >= 2: arg = argv[1] if not arg.startswith('-'): @@ -229,6 +233,8 @@ def handle_args(argv): dry_run = True elif arg == '--nohooks': nohooks = True + elif arg == '--no-history': + no_history = True else: usage('Invalid option %s.' % arg) @@ -241,7 +247,11 @@ def handle_args(argv): recipe = argv[1] props = argv[2:] - return optparse.Values({'dry_run':dry_run, 'nohooks':nohooks }), recipe, props + return ( + optparse.Values( + {'dry_run':dry_run, 'nohooks':nohooks, 'no_history': no_history }), + recipe, + props) def run_recipe_fetch(recipe, props, aliased=False): diff --git a/gclient.py b/gclient.py index b60884a0c..a297fba96 100755 --- a/gclient.py +++ b/gclient.py @@ -1808,6 +1808,9 @@ def CMDsync(parser, args): parser.add_option('--output-json', help='Output a json document to this path containing ' 'summary information about the sync.') + parser.add_option('--no-history', action='store_true', + help='GIT ONLY - Reduces the size/time of the checkout at ' + 'the cost of no history. Requires Git 1.9+') parser.add_option('--shallow', action='store_true', help='GIT ONLY - Do a shallow clone into the cache dir. ' 'Requires Git 1.9+') diff --git a/gclient_scm.py b/gclient_scm.py index a87032e74..df1054f11 100644 --- a/gclient_scm.py +++ b/gclient_scm.py @@ -825,6 +825,25 @@ class GitWrapper(SCMWrapper): # create it, so we need to do it manually. 
parent_dir = os.path.dirname(self.checkout_path) gclient_utils.safe_makedirs(parent_dir) + + template_dir = None + if options.no_history: + if gclient_utils.IsGitSha(revision): + # In the case of a subproject, the pinned sha is not necessarily the + # head of the remote branch (so we can't just use --depth=N). Instead, + # we tell git to fetch all the remote objects from SHA..HEAD by means of + # a template git dir which has a 'shallow' file pointing to the sha. + template_dir = tempfile.mkdtemp( + prefix='_gclient_gittmp_%s' % os.path.basename(self.checkout_path), + dir=parent_dir) + self._Run(['init', '--bare', template_dir], options, cwd=self._root_dir) + with open(os.path.join(template_dir, 'shallow'), 'w') as template_file: + template_file.write(revision) + clone_cmd.append('--template=' + template_dir) + else: + # Otherwise, we're just interested in the HEAD. Just use --depth. + clone_cmd.append('--depth=1') + tmp_dir = tempfile.mkdtemp( prefix='_gclient_%s_' % os.path.basename(self.checkout_path), dir=parent_dir) @@ -841,6 +860,8 @@ class GitWrapper(SCMWrapper): if os.listdir(tmp_dir): self.Print('_____ removing non-empty tmp dir %s' % tmp_dir) gclient_utils.rmtree(tmp_dir) + if template_dir: + gclient_utils.rmtree(template_dir) self._UpdateBranchHeads(options, fetch=True) self._Checkout(options, revision.replace('refs/heads/', ''), quiet=True) if self._GetCurrentBranch() is None: diff --git a/gclient_utils.py b/gclient_utils.py index 9c62a2a5b..f0d8cec6f 100644 --- a/gclient_utils.py +++ b/gclient_utils.py @@ -84,6 +84,11 @@ def SplitUrlRevision(url): return tuple(components) +def IsGitSha(revision): + """Returns true if the given string is a valid hex-encoded sha""" + return re.match('^[a-fA-F0-9]{6,40}$', revision) is not None + + def IsDateRevision(revision): """Returns true if the given revision is of the form "{ ... 
}".""" return bool(revision and re.match(r'^\{.+\}$', str(revision))) diff --git a/tests/gclient_scm_test.py b/tests/gclient_scm_test.py index e33ee0232..3dd175c69 100755 --- a/tests/gclient_scm_test.py +++ b/tests/gclient_scm_test.py @@ -704,10 +704,10 @@ class SVNWrapperTestCase(BaseTestCase): '--ignore-externals'], cwd=self.root_dir, file_list=[]) - + gclient_scm.scm.SVN._CaptureInfo([], self.base_path+'/.' ).AndReturn({'Revision': 100}) - + self.mox.ReplayAll() scm = self._scm_wrapper(url=self.url, root_dir=self.root_dir, relpath=self.relpath) @@ -784,6 +784,7 @@ class BaseGitWrapperTestCase(GCBaseTestCase, StdoutCheck, TestCaseUtils, self.force = False self.reset = False self.nohooks = False + self.no_history = False self.upstream = False self.cache_dir = None self.merge = False @@ -1205,7 +1206,7 @@ class ManagedGitWrapperTestCaseMox(BaseTestCase): def checkstdout(self, expected): value = sys.stdout.getvalue() - sys.stdout.close() + sys.stdout.close() # pylint: disable=E1101 self.assertEquals(expected, strip_timestamps(value)) diff --git a/tests/gclient_smoketest.py b/tests/gclient_smoketest.py index 3c56da8f5..eb2275e05 100755 --- a/tests/gclient_smoketest.py +++ b/tests/gclient_smoketest.py @@ -1179,6 +1179,47 @@ class GClientSmokeGITMutates(GClientSmokeBase): # files. self.assertEquals(0, len(out)) + def testSyncNoHistory(self): + if not self.enabled: + return + # Create an extra commit in repo_2 and point DEPS to its hash. 
+ cur_deps = self.FAKE_REPOS.git_hashes['repo_1'][-1][1]['DEPS'] + repo_2_hash_old = self.FAKE_REPOS.git_hashes['repo_2'][1][0][:7] + self.FAKE_REPOS._commit_git('repo_2', { # pylint: disable=W0212 + 'last_file': 'file created in last commit', + }) + repo_2_hash_new = self.FAKE_REPOS.git_hashes['repo_2'][-1][0] + new_deps = cur_deps.replace(repo_2_hash_old, repo_2_hash_new) + self.assertNotEqual(new_deps, cur_deps) + self.FAKE_REPOS._commit_git('repo_1', { # pylint: disable=W0212 + 'DEPS': new_deps, + 'origin': 'git/repo_1@4\n', + }) + + config_template = ( +"""solutions = [{ +"name" : "src", +"url" : "%(git_base)srepo_1", +"deps_file" : "DEPS", +"managed" : True, +}]""") + + self.gclient(['config', '--spec', config_template % { + 'git_base': self.git_base + }]) + + self.gclient(['sync', '--no-history', '--deps', 'mac']) + repo2_root = join(self.root_dir, 'src', 'repo2') + + # Check that repo_2 is actually shallow and its log has only one entry. + rev_lists = subprocess2.check_output(['git', 'rev-list', 'HEAD'], + cwd=repo2_root) + self.assertEquals(repo_2_hash_new, rev_lists.strip('\r\n')) + + # Check that we have actually checked out the right commit. + self.assertTrue(os.path.exists(join(repo2_root, 'last_file'))) + + class GClientSmokeBoth(GClientSmokeBase): def setUp(self): super(GClientSmokeBoth, self).setUp()