diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-07-07 17:28:56 -0400 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-07-07 17:28:56 -0400 |
commit | 549cfc2c040fd946641a9bd81a04a0695b811f50 (patch) | |
tree | 3ac8ff751970961a552daf4fb5d9cf32c0933ba7 | |
parent | 227a53004325ceeb24fef4cb9e5c090eb4e5ac77 (diff) | |
download | grokmirror-549cfc2c040fd946641a9bd81a04a0695b811f50.tar.gz |
Add support for retries
Cloning 28,000 repositories to China brings a whole bunch of ugly
things to the surface, such as intermittent TCP/HTTP resets. Retry a
pull when it is unsuccessful, up to a configurable number of times.
Additionally, this commit includes a mix of various fixes.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r-- | grokmirror.conf | 4 | ||||
-rw-r--r-- | grokmirror/__init__.py | 71 | ||||
-rwxr-xr-x | grokmirror/fsck.py | 18 | ||||
-rwxr-xr-x | grokmirror/manifest.py | 104 | ||||
-rwxr-xr-x | grokmirror/pull.py | 46 |
5 files changed, 149 insertions, 94 deletions
diff --git a/grokmirror.conf b/grokmirror.conf index 116a137..c88792c 100644 --- a/grokmirror.conf +++ b/grokmirror.conf @@ -119,6 +119,10 @@ default_owner = Grokmirror User # number at a nice low setting. pull_threads = 5 # +# If git fetch fails, we will retry up to this many times before +# giving up and marking that repository as failed. +retries = 3 +# # Use shell-globbing to list the repositories you would like to mirror. # If you want to mirror everything, just say "*". Separate multiple entries # with newline plus tab. Examples: diff --git a/grokmirror/__init__.py b/grokmirror/__init__.py index b14e2ba..7b36e46 100644 --- a/grokmirror/__init__.py +++ b/grokmirror/__init__.py @@ -29,6 +29,7 @@ import uuid import tempfile import shutil import gzip +import datetime from fcntl import lockf, LOCK_EX, LOCK_UN, LOCK_NB @@ -221,6 +222,76 @@ def get_repo_obj_info(fullpath): return obj_info +def get_repo_defs(toplevel, gitdir, usenow=False): + fullpath = os.path.join(toplevel, gitdir.lstrip('/')) + description = None + try: + descfile = os.path.join(fullpath, 'description') + with open(descfile) as fh: + contents = fh.read().strip() + if len(contents) and contents.find('edit this file') < 0: + # We don't need to tell mirrors to edit this file + description = contents + except IOError: + pass + + entries = get_config_from_git(fullpath, r'gitweb\..*') + owner = entries.get('owner', None) + + modified = 0 + + if not usenow: + args = ['for-each-ref', '--sort=-committerdate', '--format=%(committerdate:iso-strict)', '--count=1'] + ecode, out, err = run_git_command(fullpath, args) + if len(out): + try: + modified = datetime.datetime.fromisoformat(out) + except AttributeError: + # Python 3.6 doesn't have fromisoformat + # remove : from the TZ info + out = out[:-3] + out[-2:] + modified = datetime.datetime.strptime(out, '%Y-%m-%dT%H:%M:%S%z') + + if not modified: + modified = datetime.datetime.now() + + head = None + try: + with open(os.path.join(fullpath, 'HEAD')) as fh: + 
head = fh.read().strip() + except IOError: + pass + + forkgroup = None + altrepo = get_altrepo(fullpath) + if altrepo and os.path.exists(os.path.join(altrepo, 'grokmirror.objstore')): + forkgroup = os.path.basename(altrepo)[:-4] + + # we need a way to quickly compare whether mirrored repositories match + # what is in the master manifest. To this end, we calculate a so-called + # "state fingerprint" -- basically the output of "git show-ref | sha1sum". + # git show-ref output is deterministic and should accurately list all refs + # and their relation to heads/tags/etc. + fingerprint = get_repo_fingerprint(toplevel, gitdir, force=True) + # Record it in the repo for other use + set_repo_fingerprint(toplevel, gitdir, fingerprint) + repoinfo = { + 'modified': int(modified.timestamp()), + 'fingerprint': fingerprint, + 'head': head, + } + + # Don't add empty things to manifest + if owner: + repoinfo['owner'] = owner + if description: + repoinfo['description'] = description + if forkgroup: + repoinfo['forkgroup'] = forkgroup + + return repoinfo + + def get_altrepo(fullpath): altfile = os.path.join(fullpath, 'objects', 'info', 'alternates') altdir = None diff --git a/grokmirror/fsck.py b/grokmirror/fsck.py index b42c333..49390d9 100755 --- a/grokmirror/fsck.py +++ b/grokmirror/fsck.py @@ -420,8 +420,18 @@ def fsck_mirror(config, verbose=False, force=False, repack_only=False, logger.info('Running grok-fsck for [%s]', config['core'].get('toplevel')) + statusfile = config['fsck'].get('statusfile') + if not statusfile: + logger.critical('Please define fsck.statusfile in the config') + return 1 + + st_dir = os.path.dirname(statusfile) + if not os.path.isdir(os.path.dirname(statusfile)): + logger.critical('Directory %s is absent', st_dir) + return 1 + # Lock the tree to make sure we only run one instance - lockfile = config['core'].get('lock') + lockfile = os.path.join(st_dir, '.%s.lock' % os.path.basename(statusfile)) logger.debug('Attempting to obtain lock on %s', lockfile) 
flockh = open(lockfile, 'w') try: @@ -434,7 +444,6 @@ def fsck_mirror(config, verbose=False, force=False, repack_only=False, manifile = config['core'].get('manifest') manifest = grokmirror.read_manifest(manifile) - statusfile = config['fsck'].get('statusfile') if os.path.exists(statusfile): logger.info('Reading status from %s', statusfile) stfh = open(statusfile, 'r') @@ -543,7 +552,7 @@ def fsck_mirror(config, verbose=False, force=False, repack_only=False, gitdir = fullpath.replace(toplevel, '', 1) gitdir = '/' + gitdir.lstrip('/') - if gitdir not in manifest.keys(): + if gitdir not in manifest: status.pop(fullpath) logger.debug('%s is gone, no longer in manifest', gitdir) continue @@ -848,6 +857,9 @@ def fsck_mirror(config, verbose=False, force=False, repack_only=False, logger.info('Fetching %s into %s', gitdir, os.path.basename(obstrepo)) grokmirror.fetch_objstore_repo(obstrepo, childpath) + if gitdir not in manifest: + continue + if refrepo is None: # Legacy "reference=" setting in manifest refrepo = gitdir diff --git a/grokmirror/manifest.py b/grokmirror/manifest.py index a7f2a27..e628c62 100755 --- a/grokmirror/manifest.py +++ b/grokmirror/manifest.py @@ -39,6 +39,8 @@ def update_manifest(manifest, toplevel, fullpath, usenow): logger.info('%s has no heads, ignoring', gitdir) return + repoinfo = grokmirror.get_repo_defs(toplevel, gitdir, usenow=usenow) + if gitdir not in manifest: # We didn't normalize paths to be always with a leading '/', so # check the manifest for both and make sure we only save the path with a leading / @@ -51,84 +53,24 @@ def update_manifest(manifest, toplevel, fullpath, usenow): else: logger.info('Updating %s in the manifest', gitdir) - description = None - try: - descfile = os.path.join(fullpath, 'description') - with open(descfile) as fh: - contents = fh.read().strip() - if len(contents) and contents.find('edit this file') < 0: - # We don't need to tell mirrors to edit this file - description = contents - except IOError: - pass - - 
entries = grokmirror.get_config_from_git(fullpath, r'gitweb\..*') - owner = entries.get('owner', None) - - modified = 0 - - if not usenow: - args = ['for-each-ref', '--sort=-committerdate', '--format=%(committerdate:iso-strict)', '--count=1'] - ecode, out, err = grokmirror.run_git_command(fullpath, args) - if len(out): - try: - modified = datetime.datetime.fromisoformat(out) - except AttributeError: - # Python 3.6 doesn't have fromisoformat - # remove : from the TZ info - out = out[:-3] + out[-2:] - modified = datetime.datetime.strptime(out, '%Y-%m-%dT%H:%M:%S%z') - - if not modified: - modified = datetime.datetime.now() - - head = None - try: - with open(os.path.join(fullpath, 'HEAD')) as fh: - head = fh.read().strip() - except IOError: - pass - - reference = None - forkgroup = None altrepo = grokmirror.get_altrepo(fullpath) - if altrepo: - if os.path.exists(os.path.join(altrepo, 'grokmirror.objstore')): - forkgroup = os.path.basename(altrepo)[:-4] - old_forkgroup = manifest[gitdir].get('forkgroup', None) - if old_forkgroup != forkgroup: - # Use the first remote listed in the forkgroup as our reference, just so - # grokmirror-1.x clients continue to work without doing full clones - remotes = grokmirror.list_repo_remotes(altrepo, withurl=True) - if len(remotes): - urls = list(x[1] for x in remotes) - urls.sort() - reference = '/' + os.path.relpath(urls[0], toplevel) - else: - reference = manifest[gitdir].get('reference', None) - else: - # Not an objstore repo - reference = '/' + os.path.relpath(altrepo, toplevel) - - # we need a way to quickly compare whether mirrored repositories match - # what is in the master manifest. To this end, we calculate a so-called - # "state fingerprint" -- basically the output of "git show-ref | sha1sum". - # git show-ref output is deterministic and should accurately list all refs - # and their relation to heads/tags/etc. 
- fingerprint = grokmirror.get_repo_fingerprint(toplevel, gitdir, force=True) - # Record it in the repo for other use - grokmirror.set_repo_fingerprint(toplevel, gitdir, fingerprint) - - manifest[gitdir]['modified'] = int(modified.timestamp()) - manifest[gitdir]['fingerprint'] = fingerprint - manifest[gitdir]['head'] = head - # Don't add empty things to manifest - if owner: - manifest[gitdir]['owner'] = owner - if description: - manifest[gitdir]['description'] = description - if forkgroup: - manifest[gitdir]['forkgroup'] = forkgroup + reference = None + if manifest[gitdir].get('forkgroup', None) != repoinfo.get('forkgroup', None): + # Use the first remote listed in the forkgroup as our reference, just so + # grokmirror-1.x clients continue to work without doing full clones + remotes = grokmirror.list_repo_remotes(altrepo, withurl=True) + if len(remotes): + urls = list(x[1] for x in remotes) + urls.sort() + reference = '/' + os.path.relpath(urls[0], toplevel) + else: + reference = manifest[gitdir].get('reference', None) + + if altrepo and not reference and not repoinfo.get('forkgroup'): + # Not an objstore repo + reference = '/' + os.path.relpath(altrepo, toplevel) + + manifest[gitdir].update(repoinfo) if reference: manifest[gitdir]['reference'] = reference @@ -267,6 +209,8 @@ def grok_manifest(manifile, toplevel, args=None, logfile=None, usenow=False, grokmirror.manifest_lock(manifile) manifest = grokmirror.read_manifest(manifile, wait=wait) + toplevel = os.path.realpath(toplevel) + # If manifest is empty, don't use current timestamp if not len(manifest.keys()): usenow = False @@ -301,7 +245,7 @@ def grok_manifest(manifile, toplevel, args=None, logfile=None, usenow=False, # limit ourselves to passed dirs only when there is something # in the manifest. This precaution makes sure we regenerate the # whole file when there is nothing in it or it can't be parsed. 
- gitdirs = args + gitdirs = [os.path.realpath(x) for x in args] # Don't draw a progress bar for a single repo em.enabled = False @@ -340,8 +284,12 @@ def grok_manifest(manifile, toplevel, args=None, logfile=None, usenow=False, run.close() em.stop() + fetched = set() for gitdir in tofetch: altrepo = grokmirror.get_altrepo(gitdir) + if altrepo in fetched: + continue + fetched.add(altrepo) if altrepo and os.path.exists(os.path.join(altrepo, 'grokmirror.objstore')): logger.info('Fetching objects into %s', os.path.basename(altrepo)) grokmirror.fetch_objstore_repo(altrepo, gitdir) diff --git a/grokmirror/pull.py b/grokmirror/pull.py index 5d25d90..c2d3a2d 100755 --- a/grokmirror/pull.py +++ b/grokmirror/pull.py @@ -829,6 +829,8 @@ def pull_mirror(config, verbose=False, force=False, nomtime=False, privmasks = config['core'].get('private', '').split('\n') mapping = grokmirror.get_obstrepo_mapping(obstdir) freshclones = set() + retries = dict() + maxretries = config['pull'].getint('retries', 3) with mp.Pool(pull_threads) as wpool: results = list() while len(results) or len(todo): @@ -841,7 +843,12 @@ def pull_mirror(config, verbose=False, force=False, nomtime=False, if action == 'objstore': # Objstore actions aren't in the initial set, because we add them # on the fly as we repurpose existing repos for objstore - repoinfo = l_manifest[gitrepo] + if gitrepo in l_manifest: + repoinfo = l_manifest[gitrepo] + else: + # This repo is in neither manifest, but it's on disk, so we may + # as well reuse it. 
+ repoinfo = grokmirror.get_repo_defs(toplevel, gitrepo) else: repoinfo = r_culled[gitrepo] todo.remove((gitrepo, action)) @@ -857,18 +864,31 @@ def pull_mirror(config, verbose=False, force=False, nomtime=False, success, gitrepo, action, next_action, obstrepo, is_private = res.get(timeout=0.1) logger.debug('result: repo=%s, action=%s, next=%s', gitrepo, action, next_action) if not success: - logger.info(' Failed: %s', gitrepo) - failures += 1 - # To make sure we check this again during next run, - # fudge the manifest accordingly. - if gitrepo in l_manifest: - r_culled[gitrepo] = l_manifest[gitrepo] - # this is rather hackish, but effective - r_last_modified -= 1 - if obstrepo and obstrepo in pending_obstrepos: - pending_obstrepos.remove(obstrepo) - logger.debug('marked available obstrepo %s', obstrepo) - continue + if action == 'pull' and (gitrepo not in retries or gitrepo[retries] <= maxretries): + # Let's retry pulls a few times, just in case there was a network fluke + if gitrepo not in retries: + retries[gitrepo] = 1 + else: + retries[gitrepo] += 1 + logger.info(' Retry #%d: %s', retries[gitrepo], gitrepo) + next_action = 'pull' + + if next_action is None: + logger.info(' Failed: %s', gitrepo) + failures += 1 + # To make sure we check this again during next run, + # fudge the manifest accordingly. + if gitrepo in l_manifest: + r_culled[gitrepo] = l_manifest[gitrepo] + # this is rather hackish, but effective + r_last_modified -= 1 + if obstrepo and obstrepo in pending_obstrepos: + pending_obstrepos.remove(obstrepo) + logger.debug('marked available obstrepo %s', obstrepo) + + e_fin.update_from(e_que) + e_que.refresh() + continue if action == 'objstore' and gitrepo in freshclones: freshclones.remove(gitrepo) |