aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKonstantin Ryabitsev <konstantin@linuxfoundation.org>2020-07-07 17:28:56 -0400
committerKonstantin Ryabitsev <konstantin@linuxfoundation.org>2020-07-07 17:28:56 -0400
commit549cfc2c040fd946641a9bd81a04a0695b811f50 (patch)
tree3ac8ff751970961a552daf4fb5d9cf32c0933ba7
parent227a53004325ceeb24fef4cb9e5c090eb4e5ac77 (diff)
downloadgrokmirror-549cfc2c040fd946641a9bd81a04a0695b811f50.tar.gz
Add support for retries
Cloning 28,000 repositories to China brings out a whole bunch of ugly things to the surface, such as intermittent TCP/HTTP resets. Implement retrying if pull is unsuccessful, up to a defined number of times. Additionally, a mix of various fixes. Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r--grokmirror.conf4
-rw-r--r--grokmirror/__init__.py71
-rwxr-xr-xgrokmirror/fsck.py18
-rwxr-xr-xgrokmirror/manifest.py104
-rwxr-xr-xgrokmirror/pull.py46
5 files changed, 149 insertions, 94 deletions
diff --git a/grokmirror.conf b/grokmirror.conf
index 116a137..c88792c 100644
--- a/grokmirror.conf
+++ b/grokmirror.conf
@@ -119,6 +119,10 @@ default_owner = Grokmirror User
# number at a nice low setting.
pull_threads = 5
#
+# If git fetch fails, we will retry up to this many times before
+# giving up and marking that repository as failed.
+retries = 3
+#
# Use shell-globbing to list the repositories you would like to mirror.
# If you want to mirror everything, just say "*". Separate multiple entries
# with newline plus tab. Examples:
diff --git a/grokmirror/__init__.py b/grokmirror/__init__.py
index b14e2ba..7b36e46 100644
--- a/grokmirror/__init__.py
+++ b/grokmirror/__init__.py
@@ -29,6 +29,7 @@ import uuid
import tempfile
import shutil
import gzip
+import datetime
from fcntl import lockf, LOCK_EX, LOCK_UN, LOCK_NB
@@ -221,6 +222,76 @@ def get_repo_obj_info(fullpath):
return obj_info
+def get_repo_defs(toplevel, gitdir, usenow=False):
+    """Collect the manifest fields describing the repository toplevel/gitdir.
+
+    toplevel is the directory the repositories live under and gitdir is the
+    repository path below it (leading slashes are stripped to build the path).
+
+    Returns a dict that always has 'modified' (epoch seconds of the newest
+    committer date, or of the current time when usenow is set or git reports
+    no date), 'fingerprint' and 'head'; 'owner', 'description' and
+    'forkgroup' are included only when non-empty. As a side effect, the
+    freshly calculated fingerprint is recorded via set_repo_fingerprint().
+    """
+    repopath = os.path.join(toplevel, gitdir.lstrip('/'))
+
+    try:
+        with open(os.path.join(repopath, 'description')) as fh:
+            desctext = fh.read().strip()
+    except IOError:
+        desctext = ''
+    # We don't need to tell mirrors to edit this file
+    description = desctext if desctext and 'edit this file' not in desctext else None
+
+    owner = get_config_from_git(repopath, r'gitweb\..*').get('owner', None)
+
+    modified = 0
+    if not usenow:
+        gitargs = ['for-each-ref', '--sort=-committerdate', '--format=%(committerdate:iso-strict)', '--count=1']
+        _ecode, out, _err = run_git_command(repopath, gitargs)
+        if len(out):
+            try:
+                modified = datetime.datetime.fromisoformat(out)
+            except AttributeError:
+                # Python 3.6 doesn't have fromisoformat; remove the : from
+                # the TZ info so that strptime's %z can parse it
+                modified = datetime.datetime.strptime(out[:-3] + out[-2:], '%Y-%m-%dT%H:%M:%S%z')
+    if not modified:
+        modified = datetime.datetime.now()
+
+    try:
+        with open(os.path.join(repopath, 'HEAD')) as fh:
+            head = fh.read().strip()
+    except IOError:
+        head = None
+
+    altrepo = get_altrepo(repopath)
+    forkgroup = None
+    if altrepo and os.path.exists(os.path.join(altrepo, 'grokmirror.objstore')):
+        # presumably drops a 4-character ".git" suffix -- TODO confirm
+        forkgroup = os.path.basename(altrepo)[:-4]
+
+    # We need a quick way to tell whether a mirrored repository matches what
+    # is in the master manifest. For that we calculate a so-called "state
+    # fingerprint" -- basically the output of "git show-ref | sha1sum".
+    # git show-ref output is deterministic and should accurately list all
+    # refs and their relation to heads/tags/etc.
+    fingerprint = get_repo_fingerprint(toplevel, gitdir, force=True)
+    # Record it in the repo for other use
+    set_repo_fingerprint(toplevel, gitdir, fingerprint)
+
+    repoinfo = {
+        'modified': int(modified.timestamp()),
+        'fingerprint': fingerprint,
+        'head': head,
+    }
+    # Don't add empty things to manifest
+    optional = (('owner', owner), ('description', description), ('forkgroup', forkgroup))
+    repoinfo.update((key, value) for key, value in optional if value)
+    return repoinfo
+
+
def get_altrepo(fullpath):
altfile = os.path.join(fullpath, 'objects', 'info', 'alternates')
altdir = None
diff --git a/grokmirror/fsck.py b/grokmirror/fsck.py
index b42c333..49390d9 100755
--- a/grokmirror/fsck.py
+++ b/grokmirror/fsck.py
@@ -420,8 +420,18 @@ def fsck_mirror(config, verbose=False, force=False, repack_only=False,
logger.info('Running grok-fsck for [%s]', config['core'].get('toplevel'))
+ statusfile = config['fsck'].get('statusfile')
+ if not statusfile:
+ logger.critical('Please define fsck.statusfile in the config')
+ return 1
+
+ st_dir = os.path.dirname(statusfile)
+ if not os.path.isdir(os.path.dirname(statusfile)):
+ logger.critical('Directory %s is absent', st_dir)
+ return 1
+
# Lock the tree to make sure we only run one instance
- lockfile = config['core'].get('lock')
+ lockfile = os.path.join(st_dir, '.%s.lock' % os.path.basename(statusfile))
logger.debug('Attempting to obtain lock on %s', lockfile)
flockh = open(lockfile, 'w')
try:
@@ -434,7 +444,6 @@ def fsck_mirror(config, verbose=False, force=False, repack_only=False,
manifile = config['core'].get('manifest')
manifest = grokmirror.read_manifest(manifile)
- statusfile = config['fsck'].get('statusfile')
if os.path.exists(statusfile):
logger.info('Reading status from %s', statusfile)
stfh = open(statusfile, 'r')
@@ -543,7 +552,7 @@ def fsck_mirror(config, verbose=False, force=False, repack_only=False,
gitdir = fullpath.replace(toplevel, '', 1)
gitdir = '/' + gitdir.lstrip('/')
- if gitdir not in manifest.keys():
+ if gitdir not in manifest:
status.pop(fullpath)
logger.debug('%s is gone, no longer in manifest', gitdir)
continue
@@ -848,6 +857,9 @@ def fsck_mirror(config, verbose=False, force=False, repack_only=False,
logger.info('Fetching %s into %s', gitdir, os.path.basename(obstrepo))
grokmirror.fetch_objstore_repo(obstrepo, childpath)
+ if gitdir not in manifest:
+ continue
+
if refrepo is None:
# Legacy "reference=" setting in manifest
refrepo = gitdir
diff --git a/grokmirror/manifest.py b/grokmirror/manifest.py
index a7f2a27..e628c62 100755
--- a/grokmirror/manifest.py
+++ b/grokmirror/manifest.py
@@ -39,6 +39,8 @@ def update_manifest(manifest, toplevel, fullpath, usenow):
logger.info('%s has no heads, ignoring', gitdir)
return
+ repoinfo = grokmirror.get_repo_defs(toplevel, gitdir, usenow=usenow)
+
if gitdir not in manifest:
# We didn't normalize paths to be always with a leading '/', so
# check the manifest for both and make sure we only save the path with a leading /
@@ -51,84 +53,24 @@ def update_manifest(manifest, toplevel, fullpath, usenow):
else:
logger.info('Updating %s in the manifest', gitdir)
- description = None
- try:
- descfile = os.path.join(fullpath, 'description')
- with open(descfile) as fh:
- contents = fh.read().strip()
- if len(contents) and contents.find('edit this file') < 0:
- # We don't need to tell mirrors to edit this file
- description = contents
- except IOError:
- pass
-
- entries = grokmirror.get_config_from_git(fullpath, r'gitweb\..*')
- owner = entries.get('owner', None)
-
- modified = 0
-
- if not usenow:
- args = ['for-each-ref', '--sort=-committerdate', '--format=%(committerdate:iso-strict)', '--count=1']
- ecode, out, err = grokmirror.run_git_command(fullpath, args)
- if len(out):
- try:
- modified = datetime.datetime.fromisoformat(out)
- except AttributeError:
- # Python 3.6 doesn't have fromisoformat
- # remove : from the TZ info
- out = out[:-3] + out[-2:]
- modified = datetime.datetime.strptime(out, '%Y-%m-%dT%H:%M:%S%z')
-
- if not modified:
- modified = datetime.datetime.now()
-
- head = None
- try:
- with open(os.path.join(fullpath, 'HEAD')) as fh:
- head = fh.read().strip()
- except IOError:
- pass
-
- reference = None
- forkgroup = None
altrepo = grokmirror.get_altrepo(fullpath)
- if altrepo:
- if os.path.exists(os.path.join(altrepo, 'grokmirror.objstore')):
- forkgroup = os.path.basename(altrepo)[:-4]
- old_forkgroup = manifest[gitdir].get('forkgroup', None)
- if old_forkgroup != forkgroup:
- # Use the first remote listed in the forkgroup as our reference, just so
- # grokmirror-1.x clients continue to work without doing full clones
- remotes = grokmirror.list_repo_remotes(altrepo, withurl=True)
- if len(remotes):
- urls = list(x[1] for x in remotes)
- urls.sort()
- reference = '/' + os.path.relpath(urls[0], toplevel)
- else:
- reference = manifest[gitdir].get('reference', None)
- else:
- # Not an objstore repo
- reference = '/' + os.path.relpath(altrepo, toplevel)
-
- # we need a way to quickly compare whether mirrored repositories match
- # what is in the master manifest. To this end, we calculate a so-called
- # "state fingerprint" -- basically the output of "git show-ref | sha1sum".
- # git show-ref output is deterministic and should accurately list all refs
- # and their relation to heads/tags/etc.
- fingerprint = grokmirror.get_repo_fingerprint(toplevel, gitdir, force=True)
- # Record it in the repo for other use
- grokmirror.set_repo_fingerprint(toplevel, gitdir, fingerprint)
-
- manifest[gitdir]['modified'] = int(modified.timestamp())
- manifest[gitdir]['fingerprint'] = fingerprint
- manifest[gitdir]['head'] = head
- # Don't add empty things to manifest
- if owner:
- manifest[gitdir]['owner'] = owner
- if description:
- manifest[gitdir]['description'] = description
- if forkgroup:
- manifest[gitdir]['forkgroup'] = forkgroup
+ reference = None
+ if manifest[gitdir].get('forkgroup', None) != repoinfo.get('forkgroup', None):
+ # Use the first remote listed in the forkgroup as our reference, just so
+ # grokmirror-1.x clients continue to work without doing full clones
+ remotes = grokmirror.list_repo_remotes(altrepo, withurl=True)
+ if len(remotes):
+ urls = list(x[1] for x in remotes)
+ urls.sort()
+ reference = '/' + os.path.relpath(urls[0], toplevel)
+ else:
+ reference = manifest[gitdir].get('reference', None)
+
+ if altrepo and not reference and not repoinfo.get('forkgroup'):
+ # Not an objstore repo
+ reference = '/' + os.path.relpath(altrepo, toplevel)
+
+ manifest[gitdir].update(repoinfo)
if reference:
manifest[gitdir]['reference'] = reference
@@ -267,6 +209,8 @@ def grok_manifest(manifile, toplevel, args=None, logfile=None, usenow=False,
grokmirror.manifest_lock(manifile)
manifest = grokmirror.read_manifest(manifile, wait=wait)
+ toplevel = os.path.realpath(toplevel)
+
# If manifest is empty, don't use current timestamp
if not len(manifest.keys()):
usenow = False
@@ -301,7 +245,7 @@ def grok_manifest(manifile, toplevel, args=None, logfile=None, usenow=False,
# limit ourselves to passed dirs only when there is something
# in the manifest. This precaution makes sure we regenerate the
# whole file when there is nothing in it or it can't be parsed.
- gitdirs = args
+ gitdirs = [os.path.realpath(x) for x in args]
# Don't draw a progress bar for a single repo
em.enabled = False
@@ -340,8 +284,12 @@ def grok_manifest(manifile, toplevel, args=None, logfile=None, usenow=False,
run.close()
em.stop()
+ fetched = set()
for gitdir in tofetch:
altrepo = grokmirror.get_altrepo(gitdir)
+ if altrepo in fetched:
+ continue
+ fetched.add(altrepo)
if altrepo and os.path.exists(os.path.join(altrepo, 'grokmirror.objstore')):
logger.info('Fetching objects into %s', os.path.basename(altrepo))
grokmirror.fetch_objstore_repo(altrepo, gitdir)
diff --git a/grokmirror/pull.py b/grokmirror/pull.py
index 5d25d90..c2d3a2d 100755
--- a/grokmirror/pull.py
+++ b/grokmirror/pull.py
@@ -829,6 +829,8 @@ def pull_mirror(config, verbose=False, force=False, nomtime=False,
privmasks = config['core'].get('private', '').split('\n')
mapping = grokmirror.get_obstrepo_mapping(obstdir)
freshclones = set()
+ retries = dict()
+ maxretries = config['pull'].getint('retries', 3)
with mp.Pool(pull_threads) as wpool:
results = list()
while len(results) or len(todo):
@@ -841,7 +843,12 @@ def pull_mirror(config, verbose=False, force=False, nomtime=False,
if action == 'objstore':
# Objstore actions aren't in the initial set, because we add them
# on the fly as we repurpose existing repos for objstore
- repoinfo = l_manifest[gitrepo]
+ if gitrepo in l_manifest:
+ repoinfo = l_manifest[gitrepo]
+ else:
+ # This repo is in neither manifest, but it's on disk, so we may
+ # as well reuse it.
+ repoinfo = grokmirror.get_repo_defs(toplevel, gitrepo)
else:
repoinfo = r_culled[gitrepo]
todo.remove((gitrepo, action))
@@ -857,18 +864,31 @@ def pull_mirror(config, verbose=False, force=False, nomtime=False,
success, gitrepo, action, next_action, obstrepo, is_private = res.get(timeout=0.1)
logger.debug('result: repo=%s, action=%s, next=%s', gitrepo, action, next_action)
if not success:
- logger.info(' Failed: %s', gitrepo)
- failures += 1
- # To make sure we check this again during next run,
- # fudge the manifest accordingly.
- if gitrepo in l_manifest:
- r_culled[gitrepo] = l_manifest[gitrepo]
- # this is rather hackish, but effective
- r_last_modified -= 1
- if obstrepo and obstrepo in pending_obstrepos:
- pending_obstrepos.remove(obstrepo)
- logger.debug('marked available obstrepo %s', obstrepo)
- continue
+ if action == 'pull' and retries.get(gitrepo, 0) < maxretries:
+     # Let's retry pulls a few times, just in case there was a network fluke.
+     # retries maps gitrepo -> number of retries already scheduled for it;
+     # a repo we have not retried yet counts as 0, so the "<" comparison
+     # allows exactly maxretries retries (and none when it is set to 0).
+     retries[gitrepo] = retries.get(gitrepo, 0) + 1
+     logger.info(' Retry #%d: %s', retries[gitrepo], gitrepo)
+     next_action = 'pull'
+
+ if next_action is None:
+ logger.info(' Failed: %s', gitrepo)
+ failures += 1
+ # To make sure we check this again during next run,
+ # fudge the manifest accordingly.
+ if gitrepo in l_manifest:
+ r_culled[gitrepo] = l_manifest[gitrepo]
+ # this is rather hackish, but effective
+ r_last_modified -= 1
+ if obstrepo and obstrepo in pending_obstrepos:
+ pending_obstrepos.remove(obstrepo)
+ logger.debug('marked available obstrepo %s', obstrepo)
+
+ e_fin.update_from(e_que)
+ e_que.refresh()
+ continue
if action == 'objstore' and gitrepo in freshclones:
freshclones.remove(gitrepo)