#!/usr/bin/python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Simple client for the Gerrit REST API.
|
|
|
|
Example usage:
|
|
./gerrit_client.py -j /tmp/out.json -f json \
|
|
-u https://chromium.googlesource.com/chromium/src/+log
|
|
"""
|
|
|
|
import argparse
import json
import logging
import os
import sys
import tarfile
import time
import urllib
import urlparse

# Make the depot_tools root (four directories up from this file) importable
# so gerrit_util can be used below.
DEPOT_TOOLS = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir,
                 os.pardir))
sys.path.insert(0, DEPOT_TOOLS)

from gerrit_util import CreateHttpConn, ReadHttpResponse, ReadHttpJsonResponse


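# A worked example for reparse_url() below (values hypothetical): re-encoding
# a parse of 'https://host/repo/+log' with query_params={'format': 'json'}
# yields a ParseResult whose geturl() is 'https://host/repo/+log?format=json'.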
def reparse_url(parsed_url, query_params):
  """Returns a copy of `parsed_url` with its query string rebuilt from
  `query_params`."""
  return urlparse.ParseResult(
      scheme=parsed_url.scheme,
      netloc=parsed_url.netloc,
      path=parsed_url.path,
      params=parsed_url.params,
      fragment=parsed_url.fragment,
      query=urllib.urlencode(query_params, doseq=True))


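# gitiles_get() below retries with exponential backoff: for example, with
# attempts=4 a persistently failing request sleeps roughly 1, 2 and 4 seconds
# between tries before the final exception is re-raised.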
def gitiles_get(parsed_url, handler, attempts):
  """Performs a Gitiles request, retrying with exponential backoff.

  `handler` is called with the HTTP connection and its return value is passed
  through; after `attempts` failed tries the last exception is re-raised.
  """
  # This insanity is due to CreateHttpConn interface :(
  host = parsed_url.netloc
  path = parsed_url.path
  if parsed_url.query:
    path += '?%s' % (parsed_url.query, )

  retry_delay_seconds = 1
  attempt = 1
  while True:
    try:
      return handler(CreateHttpConn(host, path))
    except Exception as e:
      if attempt >= attempts:
        raise
      logging.exception('Failed to perform Gitiles operation: %s', e)

    # Retry from previous loop.
    logging.error('Sleeping %d seconds before retry (%d/%d)...',
                  retry_delay_seconds, attempt, attempts)
    time.sleep(retry_delay_seconds)
    retry_delay_seconds *= 2
    attempt += 1


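# For reference, a gitiles +log?format=json response has roughly this shape
# (the per-commit fields shown are illustrative, not exhaustive):
#   {
#     "log": [{"commit": "<sha>", "message": "...", ...}, ...],
#     "next": "<sha starting the following page>"
#   }
# fetch_log_with_paging() keeps requesting pages, feeding "next" back in as
# the "s" query parameter, until at least `limit` commits are collected.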
def fetch_log_with_paging(query_params, limit, fetch):
  """Fetches log, possibly requesting multiple pages to do so.

  Args:
    query_params (dict): Parameters to use in the request.
    limit (int): Minimum number of commits to fetch across pages (effectively
        the aggregated page size).
    fetch (function): Function to use to make the requests.

  Returns:
    Dict with key "log", whose value is a list of commits.
  """
  # Log api returns {'log': [list of commits], 'next': hash}.
  last_result = fetch(query_params)
  commits = last_result['log']
  while last_result.get('next') and len(commits) < limit:
    query_params['s'] = last_result.get('next')
    last_result = fetch(query_params)
    # The first commit in `last_result` is not necessarily the parent of the
    # last commit in the result so far! This is because the log command can be
    # run on a single file object, for example:
    #   https://gerrit.googlesource.com/gitiles/+log/1c21279f337da8130/COPYING
    # Even when getting the log for the whole repository, there could be merge
    # commits.
    commits.extend(last_result['log'])
  # Use the 'next' field (if any) from `last_result`, but the commits
  # aggregated from all the results. This essentially imitates paging with at
  # least `limit` page size.
  last_result['log'] = commits
  logging.debug(
      'fetched %d commits, next: %s.', len(commits),
      last_result.get('next'))
  return last_result


def main(arguments):
  parser = create_argparser()
  args = parser.parse_args(arguments)

  if args.extract_to and args.format != "archive":
    parser.error('--extract-to requires --format=archive')
  if not args.extract_to and args.format == "archive":
    parser.error('--format=archive requires --extract-to')

  if args.extract_to:
    # make sure it is absolute and ends with '/'
    args.extract_to = os.path.join(os.path.abspath(args.extract_to), '')
    os.makedirs(args.extract_to)

  parsed_url = urlparse.urlparse(args.url)
  if not parsed_url.scheme.startswith('http'):
    parser.error('Invalid URI scheme (expected http or https): %s' % args.url)

  query_params = {}
  if parsed_url.query:
    query_params.update(urlparse.parse_qs(parsed_url.query))
  # Force the format specified on command-line.
  if query_params.get('format'):
    parser.error('URL must not contain format; use --format command line flag '
                 'instead.')
  query_params['format'] = args.format

  kwargs = {}
  accept_statuses = frozenset([int(s) for s in args.accept_statuses.split(',')])
  if accept_statuses:
    kwargs['accept_statuses'] = accept_statuses

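  # Note (illustrative): with the default --accept-statuses=200 the handlers
  # below receive accept_statuses=frozenset([200]); a value such as
  # --accept-statuses=200,404 widens that to frozenset([200, 404]).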
  # Choose handler.
  if args.format == 'json':
    def handler(conn):
      return ReadHttpJsonResponse(conn, **kwargs)
  elif args.format == 'text':
    # Text fetching will pack the text into structured JSON.
    def handler(conn):
      # Wrap in a structured JSON for export to recipe module.
      return {
          'value': ReadHttpResponse(conn, **kwargs).read() or None,
      }
  elif args.format == 'archive':
    # Archive fetching hooks result to tarfile extraction. This implementation
    # is able to do a streaming extraction operation without having to buffer
    # the entire tarfile.
    def handler(conn):
      ret = {
          'extracted': {
              'filecount': 0,
              'bytes': 0,
          },
          'skipped': {
              'filecount': 0,
              'bytes': 0,
              'names': [],
          }
      }
      fileobj = ReadHttpResponse(conn, **kwargs)
      with tarfile.open(mode='r|*', fileobj=fileobj) as tf:
        # monkeypatch the TarFile object to allow printing messages and
        # collecting stats for each extracted file. extractall makes a single
        # linear pass over the tarfile, which is compatible with
        # ReadHttpResponse; other naive implementations (such as `getmembers`)
        # do random access over the file and would require buffering the whole
        # thing (!!).
        em = tf._extract_member
        def _extract_member(tarinfo, targetpath):
          if not os.path.abspath(targetpath).startswith(args.extract_to):
            print 'Skipping %s' % (tarinfo.name,)
            ret['skipped']['filecount'] += 1
            ret['skipped']['bytes'] += tarinfo.size
            ret['skipped']['names'].append(tarinfo.name)
            return
          print 'Extracting %s' % (tarinfo.name,)
          ret['extracted']['filecount'] += 1
          ret['extracted']['bytes'] += tarinfo.size
          return em(tarinfo, targetpath)
        tf._extract_member = _extract_member
        tf.extractall(args.extract_to)
      return ret

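  # The 's' query parameter is the gitiles paging cursor, so --log-start
  # effectively resumes a log listing from the 'next' value of an earlier
  # response.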
  if args.log_start:
    query_params['s'] = args.log_start

  def fetch(query_params):
    parsed_url_with_query = reparse_url(parsed_url, query_params)
    result = gitiles_get(parsed_url_with_query, handler, args.attempts)
    if not args.quiet:
      logging.info('Read from %s: %s', parsed_url_with_query.geturl(), result)
    return result

  if args.log_limit:
    if args.format != 'json':
      parser.error('--log-limit works with json format only')
    result = fetch_log_with_paging(query_params, args.log_limit, fetch)
  else:
    # Either not a log request, or don't care about paging.
    # So, just return whatever is fetched the first time.
    result = fetch(query_params)

  with open(args.json_file, 'w') as json_file:
    json.dump(result, json_file)
  return 0


def create_argparser():
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-j', '--json-file',
      help='Path to json file for output.')
  parser.add_argument(
      '--extract-to',
      help='Local path to extract the archive to. Must not exist.')
  parser.add_argument(
      '-f', '--format', required=True, choices=('json', 'text', 'archive'))
  parser.add_argument(
      '-u', '--url', required=True,
      help='URL of gitiles. For example, '
           'https://chromium.googlesource.com/chromium/src/+refs. '
           'Insert a/ after the domain for authenticated access.')
  parser.add_argument(
      '-a', '--attempts', type=int, default=1,
      help='The number of attempts to make (with exponential backoff) before '
           'failing. If several requests are to be made, applies to each '
           'request separately.')
  parser.add_argument(
      '-q', '--quiet', action='store_true',
      help='Suppress file contents logging output.')
  parser.add_argument(
      '--log-limit', type=int, default=None,
      help='Follow gitiles pages to fetch at least this many commits. By '
           'default, only the first page (with an unspecified number of '
           'commits) is fetched. Only for '
           'https://<hostname>/<repo>/+log/... gitiles requests.')
  parser.add_argument(
      '--log-start',
      help='If given, continue fetching the log by paging from this commit '
           'hash. This value can typically be taken from the json result of '
           'a previous log call, which returns the start commit of the next '
           'page as the "next" key. Only for '
           'https://<hostname>/<repo>/+log/... gitiles requests.')
  parser.add_argument(
      '--accept-statuses', type=str, default='200',
      help='Comma-separated list of status codes to accept as "successful" '
           'HTTP responses.')
  return parser


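# Example invocation (illustrative): follow gitiles paging until at least 500
# commits have been fetched and write the merged JSON to disk:
#   ./gerrit_client.py -f json -j /tmp/log.json --log-limit 500 \
#     -u https://chromium.googlesource.com/chromium/src/+log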
if __name__ == '__main__':
  logging.basicConfig()
  logging.getLogger().setLevel(logging.INFO)
  sys.exit(main(sys.argv[1:]))