# SPDX-License-Identifier: GPL-2.0-or-later
# Copyright 2023 Google LLC

from pathlib import Path
import datetime
import fcntl
import gzip
import hashlib
import mailbox
import pickle
import re
import subprocess
import sys
import tempfile
import urllib.parse

from bs4 import BeautifulSoup
import requests


class Config:
    """The configuration.  The default values defined below can be overridden
    by a local file config.py.  See the file README.md."""

    def __init__(self):
        # Print debug messages?
        self.verbose = False

        # Path to the Linux git repo to use
        self.linux_dir = '.'

        # First git commit to consider when looking at the history.  Any
        # commits before this will be ignored to improve performance.
        self.start_of_history = 'v4.0'

        # URL of the public-inbox server to query
        self.lore = 'https://lore.kernel.org'

        # User-Agent header to use in requests to the server
        self.user_agent = 'stable_utils.py/0.1'

        # Amount of time to locally cache responses from the server before an
        # identical request will be re-attempted
        self.lore_cache_timeout = datetime.timedelta(hours=24)

        # The git ref name for the latest mainline
        self.upstream = 'origin/master'


config = Config()
try:
    import config as local_config
    local_config.customize(config)
except ModuleNotFoundError:
    pass
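# For reference, a local config.py only needs to define customize(config).
# A minimal sketch (the values below are illustrative, not recommendations):
#
#     def customize(config):
#         config.verbose = True
#         config.linux_dir = '/path/to/linux'
#         config.upstream = 'origin/master'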
SCRIPT_DIR = Path(__file__).parent

LORE_RESULTS_PER_PAGE = 200
LORE_RESULTS_SECTION_REGEX = re.compile('- by .* @ [0-9]{4}-[0-9]{2}-[0-9]{2}')
PATCH_NUMBER_REGEX = re.compile(r'\[.*\s([0-9]+)/([0-9]+).*\]')
BACKPORT_PATCH_SUBJECT_REGEX = re.compile(r'[0-9]+\.[0-9]+')
BACKPORT_PATCH_BODY_REGEX = re.compile(
        r'(commit [0-9a-f]{40} upstream)|(upstream commit [0-9a-f]{40})',
        re.IGNORECASE)
WHITESPACE_REGEX = re.compile(r'\s+')


def debug(string):
    """Prints a DEBUG message if verbose mode is enabled."""
    if config.verbose:
        sys.stderr.write(f'[DEBUG] {string}\n')


def warn(string):
    """Prints a WARNING message."""
    sys.stderr.write(f'[WARNING] {string}\n')


def error(string):
    """Prints an ERROR message and exits with failure status."""
    sys.stderr.write(f'[ERROR] {string}\n')
    sys.exit(1)


class Cache:
    """A cache that maps string keys to bytes values, implemented as a
    directory.  Multi-process safe.  Keys must be valid filenames."""

    def __init__(self, name, version, timeout):
        self._dir_path = SCRIPT_DIR / name
        self._timeout = timeout
        self._lock_file = SCRIPT_DIR / (name + '.lock')
        with self._locked():
            version_file = SCRIPT_DIR / (name + '.version')
            if self._dir_path.exists():
                try:
                    cur_version = int(version_file.read_text())
                except FileNotFoundError:
                    cur_version = 0
                if cur_version != version:
                    debug(f'Clearing {name}, as it has a different version number')
                # Delete all expired files (or all files, on a version change).
                for path in self._dir_path.iterdir():
                    if cur_version != version or self._is_file_expired(path):
                        path.unlink()
                if cur_version != version:
                    version_file.write_text(f'{version}\n')
                return
            self._dir_path.mkdir()
            version_file.write_text(f'{version}\n')

    def read(self, key):
        """Returns the cached value for the given key, or None."""
        with self._locked():
            path = self._dir_path / key
            if self._is_file_expired(path):
                return None
            return path.read_bytes()

    def write(self, key, value):
        """Writes a key-value pair to this Cache."""
        with self._locked():
            path = self._dir_path / key
            tmp_path = self._dir_path / (key + '.tmp')
            tmp_path.write_bytes(value)
            tmp_path.rename(path)

    def _is_file_expired(self, path):
        try:
            age = (datetime.datetime.now() -
                   datetime.datetime.fromtimestamp(path.stat().st_mtime))
            return age > self._timeout
        except FileNotFoundError:
            return True

    def _locked(self):
        return FileLock(self._lock_file)


class FileLock:
    """An exclusive file lock, usable with Python's 'with' statement."""

    def __init__(self, path):
        self._path = path
        self._file = None

    def __enter__(self):
        self._file = open(self._path, 'wb')
        fcntl.lockf(self._file, fcntl.LOCK_EX)

    def __exit__(self, exception_type, exception_value, traceback):
        fcntl.lockf(self._file, fcntl.LOCK_UN)
        self._file.close()


lore_cache = Cache('lore_cache', 1, config.lore_cache_timeout)
git_cache = Cache('git_cache', 1, datetime.timedelta(days=30))
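# Example Cache usage (illustrative): keys must be valid filenames, values
# are bytes, and read() returns None once an entry is older than the timeout:
#
#     demo = Cache('demo_cache', 1, datetime.timedelta(minutes=5))
#     demo.write('some-key', b'some value')
#     assert demo.read('some-key') == b'some value'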
def lore_request(url_path, post=False):
    """Makes a GET or POST request to a public-inbox server, with caching.
    On success, returns the resulting content as bytes.  On 404 error,
    returns None.  On other error, raises an exception."""
    method = 'POST' if post else 'GET'
    url = config.lore + '/' + url_path
    req_hash = hashlib.sha256(f'{method} {url}'.encode('utf-8')).hexdigest()

    # Return a cached response if possible.
    content = lore_cache.read(req_hash)
    if content:
        debug(f'Cache hit for {method} {url}')
        return content

    # Cache miss; make the actual request.
    debug(f'{method} {url}')
    headers = {'User-Agent': config.user_agent}
    if post:
        req = requests.post(url, timeout=30, headers=headers)
    else:
        req = requests.get(url, timeout=30, headers=headers)
    if req.status_code == 404:
        return None
    req.raise_for_status()

    # Decompress the response if needed.
    content = req.content
    if content[:3] == b'\x1f\x8b\x08':
        content = gzip.decompress(content)

    # Cache and return the response.
    lore_cache.write(req_hash, content)
    return content


def fetch_raw_message(message_id):
    """Fetches a message from the mailing list archive, given its Message-Id.
    Returns the message as bytes, or None if the message isn't found."""
    return lore_request(f'all/{urllib.parse.quote(message_id, safe="")}/raw')


def fetch_message(message_id):
    """Fetches a message from the mailing list archive, given its Message-Id.
    Returns the message as a mailbox.Message object, or None if the message
    isn't found."""
    content = fetch_raw_message(message_id)
    if not content:
        return None
    return mailbox.Message(content)


def fetch_thread(message_id):
    """Fetches a thread from the mailing list archive, given the Message-Id
    of any message contained in the thread.  Returns the thread as a
    mailbox.mbox object, or None if the thread isn't found."""
    content = lore_request(
            f'all/{urllib.parse.quote(message_id, safe="")}/t.mbox.gz')
    if not content:
        return None
    with tempfile.NamedTemporaryFile() as file:
        file.write(content)
        # Flush so that mailbox.mbox sees the full contents on-disk.
        file.flush()
        return mailbox.mbox(file.name)


def list_matching_emails(query_string):
    """Searches the mailing list archive for email messages that match the
    given search query string.  The search results are generated as
    (message_id, subject) tuples in reverse chronological order.  For the
    supported search query syntax, see
    https://lore.kernel.org/all/_/text/help/"""
    offset = 0
    while True:
        content = lore_request(
                f'all/?q={urllib.parse.quote(query_string, safe="")}&o={offset}')
        if not content:
            break
        soup = BeautifulSoup(content, 'html.parser')
        results = next((pre for pre in soup.find_all('pre')
                        if LORE_RESULTS_SECTION_REGEX.search(pre.text)), None)
        if not results:
            break
        count = 0
        for a_element in results.find_all('a'):
            yield (urllib.parse.unquote(a_element['href'].rstrip('/')),
                   a_element.text)
            count += 1
        if count < LORE_RESULTS_PER_PAGE:
            break
        offset += count

# Similar to list_matching_emails(), but downloads the full messages.
# def fetch_matching_emails(query_string, offset=0):
#     return lore_request(f'all/?q={urllib.parse.quote(query_string, safe="")}'
#                         f'&o={offset}&x=m', post=True)
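# Example (illustrative): lazily iterate over lore search results for a
# given patch title, using the public-inbox query syntax linked above:
#
#     for (msg_id, subject) in list_matching_emails('s:"some patch title"'):
#         print(msg_id, subject)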
def git(args):
    """Runs a git command on linux_dir and returns its output as a string."""
    args = ['git'] + args
    debug('Running command: ' + str(args))
    try:
        result = subprocess.run(args, cwd=config.linux_dir, check=True,
                                capture_output=True)
    except subprocess.CalledProcessError as ex:
        sys.stderr.buffer.write(ex.stderr)
        error(str(ex))
    return result.stdout.decode('utf-8', errors='replace').rstrip()


def normalize_title(string):
    """Normalizes a commit title or PATCH email subject by normalizing
    whitespace, stripping bracketed sections, and converting to lower case."""
    string = string.strip()
    string = WHITESPACE_REGEX.sub(' ', string)
    string = string.lower()
    while string.startswith('['):
        i = string.find(']')
        if i == -1:
            break
        string = string[i + 1:].strip()
    return string


class Commit:
    """A git commit."""

    def __init__(self, commit_id):
        self.id = commit_id
        self._title = None
        self._body = None

    def get_title(self):
        """Returns the title of this commit."""
        if not self._title:
            self._title = git(['log', '--pretty=%s', '-1', self.id])
        return self._title

    def get_body(self):
        """Returns the body of this commit."""
        if not self._body:
            self._body = git(['log', '--pretty=%b', '-1', self.id])
        return self._body

    def get_message_ids(self):
        """Returns the list of email Message IDs that are mentioned in this
        Commit's body via Message-Id and Link tags."""
        ids = []
        for line in self.get_body().split('\n'):
            if line.startswith('Message-Id:'):
                ids.append(urllib.parse.unquote(line.split()[1].strip('<>')))
            elif line.startswith('Link:'):
                link = line.split()[1]
                if ('/lore.kernel.org/' in link or
                        '/lkml.kernel.org/' in link or
                        '/patch/msgid/' in link):
                    ids.append(urllib.parse.unquote(
                            link.strip('/').split('/')[-1]))
        return ids

    def find_original_email(self):
        """Tries to find the original PATCH email for this Commit.  Returns
        the message ID of the original patch, or None if no original patch
        is found.  This is not 100% reliable, as it relies on heuristics."""
        normalized_title = normalize_title(self.get_title())

        # First look for a matching "Message-Id:" or "Link:" in the commit
        # body.
        for message_id in self.get_message_ids():
            msg = fetch_message(message_id)
            if msg and normalized_title == normalize_title(msg['Subject']):
                return message_id

        # Fall back to a search by commit title.
        debug(f'Falling back to search by commit title for {self}')
        potential_matches = [result for result in self.list_matching_emails()
                             if normalized_title == normalize_title(result[1])]

        # Take the first (chronologically last) patch that doesn't look like
        # a backport -- that is, doesn't contain a number like "5.10" in the
        # subject line and doesn't contain a line like "commit
        # 89d77f71f493a3663b10fa812d17f472935d24be upstream" in the body.
        for (message_id, subject) in potential_matches:
            if (not BACKPORT_PATCH_SUBJECT_REGEX.search(subject) and
                    not BACKPORT_PATCH_BODY_REGEX.search(
                            str(fetch_message(message_id)))):
                return message_id

        # If that still didn't work, then maybe the original patch looked
        # like a backport.  Take the first (chronologically last) patch that
        # didn't have stable@ in the recipients.  (Either header may be
        # absent, so fall back to an empty string.)
        for (message_id, subject) in potential_matches:
            msg = fetch_message(message_id)
            if msg and ('stable@vger.kernel.org' not in (msg['To'] or '') and
                        'stable@vger.kernel.org' not in (msg['Cc'] or '')):
                return message_id

        # Nothing worked, oh well...
        debug(f'Cannot find original email for {self}')
        return None

    def list_matching_emails(self):
        """Lists the emails that have this commit's title in their subject."""
        return list_matching_emails(f's:"{self.get_title()}"')

    def is_autosel(self):
        """Returns true if this commit corresponds to an AUTOSEL patch."""
        # This is an over-simplistic way to do it, but maybe it's good enough.
        return any('AUTOSEL' in subject
                   for (_, subject) in self.list_matching_emails())

    def __str__(self):
        return f'commit {self.id[:12]} ("{self.get_title()}")'

    def __repr__(self):
        return self.id


def get_message_id(msg):
    """Returns the Message-Id of a mailbox.Message object."""
    return msg['Message-Id'].strip().strip('<>')
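# Example (illustrative): normalize_title() makes commit titles and email
# subjects directly comparable.  Both of the following normalize to
# 'foo: fix the frobnicator':
#
#     normalize_title('[PATCH v2 3/7] foo: Fix   the frobnicator')
#     normalize_title('Foo: fix the frobnicator')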
def find_patches_in_same_series(message_id):
    """Tries to find the patch series containing the patch with the given
    message ID.  On success, returns the array of patch Messages of length
    N+1, where N is the number of patches in the series.  Index 0 contains
    the cover letter, or None if no cover letter was found.  On failure,
    returns None."""
    thread = fetch_thread(message_id)
    if not thread:
        warn(f'Failed to fetch thread containing {message_id}')
        return None
    target_patch = next((msg for msg in thread
                         if message_id == get_message_id(msg)), None)
    if not target_patch:
        warn(f'Thread of {message_id} does not contain itself!')
        return None
    target_subject = target_patch['Subject']
    match = PATCH_NUMBER_REGEX.search(target_subject)
    if not match:
        # standalone patch
        return [None, target_patch]
    target_patch_idx = int(match.group(1))
    num_patches = int(match.group(2))
    if target_patch_idx > num_patches:
        warn(f'Invalid patch subject "{target_subject}"')
        return None
    patches = [None] * (num_patches + 1)
    patches[target_patch_idx] = target_patch
    for msg in thread:
        subject = msg['Subject'].strip()
        if not subject.startswith('['):
            continue
        match = PATCH_NUMBER_REGEX.search(subject)
        if not match:
            continue
        i = int(match.group(1))
        if i > num_patches or int(match.group(2)) != num_patches:
            debug(f'Ignoring "{subject}" since it is inconsistent with '
                  f'series containing {message_id}')
            continue
        if patches[i]:
            # Duplicates happen frequently.
            continue
        patches[i] = msg
    if any(not patch for patch in patches[1:]):
        debug(f'Some patches of series containing {message_id} were not found')
        return None
    return patches


class GitHistoryIndex:
    """A data structure that maps (normalized) git commit title to the list
    of commit IDs that have that title."""

    def __init__(self):
        self._dict = {}

    def append(self, start_commit, end_commit):
        """Appends the history from start_commit to end_commit to this
        index."""
        for line in git(['log', '--pretty=%H %s', '--reverse',
                         f'{start_commit}..{end_commit}']).split('\n'):
            # Careful: line.split(maxsplit=1) fails on Linux commit
            # 7b7abfe3dd81d659, which has an empty title!
            commit_id = bytes.fromhex(line[:40])
            title = line[41:]
            self._dict.setdefault(normalize_title(title), []).append(commit_id)

    def __contains__(self, key):
        return key in self._dict

    def get(self, normalized_title):
        """Returns a list of commit IDs that have the given normalized
        title."""
        return self._dict.get(normalized_title, [])
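# Example (illustrative): GitHistoryIndex supports membership tests and
# lookups by normalized title:
#
#     index = GitHistoryIndex()
#     index.append('v6.3', 'v6.4')
#     title = normalize_title('foo: fix the frobnicator')
#     if title in index:
#         commit_ids = index.get(title)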
def _extract_kernel_version(commit):
    major = -1
    minor = -1
    extraversion = ''
    for line in git(['show', f'{commit}:Makefile']).split('\n'):
        if line.startswith('VERSION = '):
            major = int(line.split()[2])
        if line.startswith('PATCHLEVEL = '):
            minor = int(line.split()[2])
        try:
            if line.startswith('EXTRAVERSION = '):
                extraversion = line.split()[2]
        except IndexError:
            pass
    if major < 0 or minor < 0:
        error(f'Failed to extract kernel major.minor version number at {commit}')
    return (major, minor, extraversion)


def extract_kernel_version(commit):
    """Returns the last v{major}.{minor} tag that the given kernel commit is
    based on.  Release candidates aren't counted, so if for example the
    commit is based on v6.4-rc1, this returns v6.3."""
    (major, minor, extraversion) = _extract_kernel_version(commit)
    if 'rc' in extraversion:
        commit = f'v{major}.{minor}-rc1~1'
        (major, minor, extraversion) = _extract_kernel_version(commit)
    if extraversion:
        error(f'Unexpectedly found EXTRAVERSION at {commit}')
    return f'v{major}.{minor}'


def get_history_index(end_commit):
    """Returns a GitHistoryIndex that indexes the history of the Linux
    kernel by commit title in the range config.start_of_history to
    end_commit.

    To speed up repeated executions, the index of the history until the
    current major.minor version is built on its own and is cached on-disk.
    The index of the history until end_commit is then generated by loading
    the cached index and appending the commits from major.minor to
    end_commit."""
    baseline = extract_kernel_version(end_commit)
    histfile = f'history_{config.start_of_history}..{baseline}'
    try:
        content = git_cache.read(histfile)
        if not content:
            raise FileNotFoundError(f'{histfile} is not cached yet')
        index = pickle.loads(content)
        debug(f'Loaded {histfile}')
    except Exception as ex:
        debug(str(ex))
        debug(f'Indexing Linux history {config.start_of_history}..{baseline}')
        index = GitHistoryIndex()
        index.append(config.start_of_history, baseline)
        debug(f'Writing {histfile}')
        git_cache.write(histfile, pickle.dumps(index))
    index.append(baseline, end_commit)
    return index
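# Example (illustrative): extract_kernel_version() skips release candidates,
# returning the last actual release the commit is based on:
#
#     extract_kernel_version('v6.4')      # -> 'v6.4'
#     extract_kernel_version('v6.4-rc1')  # -> 'v6.3'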
def find_missing_prereqs(commit_range, autosel_only=False):
    """For the given range of backported commits, finds commits that are
    backported without previous patches in their original series.

    Generates a (patches, backports, missing) tuple for each relevant patch
    series.  'patches' is the full original patch series, including the
    cover letter if available, as a list of mailbox.Message objects.
    'backports' is a sorted list of (patch number, backported Commit)
    tuples.  'missing' is a sorted list of (patch number, missing upstream
    Commit) tuples."""

    class Series:
        def __init__(self, patches):
            self.patches = patches
            self.backports = {}

        def all_message_ids(self):
            return (get_message_id(patch) for patch in self.patches[1:])

        def add_backport(self, message_id, commit):
            for i in range(1, len(self.patches)):
                if message_id == get_message_id(self.patches[i]):
                    self.backports[i] = commit
                    return
            error(f'{message_id} maps to a series, but patch number not found')

    def find_original_email_fast(commit, message_id_to_series):
        """If the commit is explicitly tagged with a message ID that is
        already present in one of the already-downloaded threads, then
        assume that is the correct message ID.  This saves some work."""
        for message_id in commit.get_message_ids():
            if message_id in message_id_to_series:
                debug(f'Already found the thread containing {commit}')
                return message_id
        return commit.find_original_email()

    # Expand the given commit range into a list of commit IDs.
    commit_ids = git(['log', '--reverse', '--pretty=%H',
                      commit_range]).split('\n')

    # Build an index of the history until the last given commit.
    downstream_history_index = get_history_index(commit_ids[-1])

    # Build an index of the history until the latest upstream.
    upstream_history_index = get_history_index(config.upstream)

    # All series seen so far
    all_series = []

    # Map from message ID to series, for all messages in all series seen so
    # far
    message_id_to_series = {}

    # For each specified backport commit...
    for (i, commit_id) in enumerate(commit_ids):
        debug(f'Processing commit {i+1} of {len(commit_ids)} [{commit_id}]')
        commit = Commit(commit_id)

        # Find the original patch email, then the patch series that contains
        # it.
        message_id = find_original_email_fast(commit, message_id_to_series)
        if not message_id:
            continue
        series = message_id_to_series.get(message_id)
        if not series:
            patches = find_patches_in_same_series(message_id)
            if not patches:
                continue
            # Found a patch series that we haven't seen before.
            series = Series(patches)
            all_series.append(series)
            for mid in series.all_message_ids():
                message_id_to_series[mid] = series

        # Keep track of which patches in the series have a backport commit
        # in the specified range.
        series.add_backport(message_id, commit)

    # For each series that was found...
    for (i, series) in enumerate(all_series):
        debug(f'Processing series {i+1} of {len(all_series)}')

        # Get the number of the last patch in the series that is backported.
        max_backported_patch_num = max(series.backports)

        missing = {}

        # Check whether any earlier patches in the series seem to be
        # missing.  (For now, we don't check for *later* missing patches.)
        for i in range(1, max_backported_patch_num):
            # Is the patch being backported in the given commit range?
            if i in series.backports:
                continue
            patch = series.patches[i]

            # Was the patch already backported before the given commit
            # range?
            title = normalize_title(patch['Subject'])
            if title in downstream_history_index:
                continue

            # Nope, it's probably missing.  Try to find the corresponding
            # upstream commit.  If it's successfully found, consider it
            # missing.
            for cid in reversed(upstream_history_index.get(title)):
                commit = Commit(cid.hex())
                # Sanity check against find_original_email() before
                # recommending this commit.
                mid = commit.find_original_email()
                if mid and mid == get_message_id(patch):
                    missing[i] = commit
                    break

        # If the series has missing patches, report them.
        if not missing:
            continue

        # In --autosel-only mode, suppress reports for series where none of
        # the backports appear to be from AUTOSEL.
        if autosel_only and not any(c.is_autosel()
                                    for c in series.backports.values()):
            debug(f'Not reporting missing prerequisites of non-AUTOSEL '
                  f'commits {list(series.backports.values())}')
            continue

        yield (series.patches, sorted(series.backports.items()),
               sorted(missing.items()))


def parse_args(argparser):
    """Adds common options and parses the command arguments."""
    argparser.add_argument('--verbose', action='store_true',
                           help='show debug messages')
    args = argparser.parse_args()
    if args.verbose:
        config.verbose = True
    res = subprocess.run(['git', 'log', '-1', config.start_of_history],
                         check=False, capture_output=True,
                         cwd=config.linux_dir)
    if res.returncode != 0:
        error('Run this script with the working directory in the kernel '
              'repo, or create a config.py that sets linux_dir.')
    return args
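# Example (illustrative): a minimal script built on this module might look
# like the following.  The module name, description, and commit range here
# are assumptions for illustration only:
#
#     import argparse
#     import stable_utils
#
#     argparser = argparse.ArgumentParser(
#             description='Find missing prerequisite patches')
#     args = stable_utils.parse_args(argparser)
#     for (patches, backports, missing) in \
#             stable_utils.find_missing_prereqs('v6.1..stable/linux-6.1.y'):
#         for (num, commit) in missing:
#             print(f'Possibly missing prerequisite: {commit}')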