#!/usr/bin/env python3
"""
Standalone command-line implementation of the stable bucketing algorithm
used to reliably group users into experiments.

This tool lives at `scripts/stable_bucketer`. The in-repo implementation it
mirrors is `lms/djangoapps/experiments/stable_bucketing.py`, whose module
docstring points here:

    An implementation of this is available as a standalone command-line
    tool, `scripts/stable_bucketer`, which can both validate the
    bucketing of a username and generate recognizable usernames for
    particular experiment buckets for testing.
"""

import argparse
import hashlib
import os
import random
import re
import string
import sys
#####

# Characters used for the random suffix of generated names, and its length.
_SUFFIX_CHARS = string.digits + string.ascii_lowercase
_SUFFIX_LENGTH = 5


### Main ###
def main(args, env):
    """
    Run the command-line tool.

    Arguments:
        args: full argument vector, program name first (as in `sys.argv`).
        env: mapping of environment variables; only `USER` is read, as the
            default base user name.

    Returns:
        0 on success, suitable for use as the process exit status.
        Argument errors terminate through argparse (SystemExit).
    """
    epilog = (
        "Checks username bucketing for experiments and generates names for each experiment bucket. "
        "Derived names include the base user name, experiment abbreviation, bucket number, "
        "and a short random string, separated with hyphens. (v1.0)"
    )
    parser = argparse.ArgumentParser(epilog=epilog)
    parser.add_argument(
        "exp",
        metavar="EXPERIMENT",
        help="Experiment to bucket for.",
    )
    parser.add_argument(
        "user",
        nargs="?",
        default=env.get("USER", ""),
        metavar="NAME",
        help="Base user name for bucketing, default is $USER.",
    )
    parser.add_argument(
        "-a", "--abbrev",
        metavar="EXP",
        help="Experiment abbreviation for name generation.",
    )
    parser.add_argument(
        "-b", "--buckets",
        nargs="+",
        type=int,
        metavar="X",
        help="Buckets to make names for, default is all buckets.",
    )
    parser.add_argument(
        "-c", "--check-only",
        action="store_true",
        help="Just check what bucket the user is in, don't generate names.",
    )
    parser.add_argument(
        "-n", "--number",
        type=int,
        default=2,
        metavar="N",
        help="Number of buckets, default is 2.",
    )
    parser.add_argument(
        "--print-args",
        action="store_true",
        # Hidden debugging aid: print arguments and computations, then exit.
        help=argparse.SUPPRESS,
    )
    # Parse the argument vector we were handed rather than the global
    # sys.argv, so main() also works when imported and called directly.
    my_args = parser.parse_args(args[1:])
    bucket_number = my_args.number
    if bucket_number < 1:
        # The bucket is computed modulo this number; zero or negative values
        # cannot describe an experiment.
        parser.error("number of buckets must be at least 1")

    hashed = hash_exp(my_args.exp, my_args.user)
    digest = bucket_int(hashed)
    bucket = digest % bucket_number

    abbrev = my_args.abbrev
    if abbrev is None:
        abbrev = abbreviate(my_args.exp)

    print("{user} is in bucket: {bucket}".format(user=my_args.user, bucket=bucket))
    if my_args.print_args:
        print(
            "* Args:\n\t{my_args}\n* Computed:\n\tdigest: {digest} - hash: {hashed} - {abbrev}".format(
                my_args=my_args, digest=digest, hashed=hashed, abbrev=abbrev,
            )
        )
        return 0

    if my_args.check_only:
        return 0

    bucket_list = my_args.buckets
    if not bucket_list:
        bucket_list = range(bucket_number)

    # TODO: validate more of the arguments
    # HACK: currently not enforcing the naming rules:
    # - Username must be between 2 and 30 characters long.
    # - Usernames can only contain letters (A-Z, a-z), numerals (0-9), underscores (_), and hyphens (-).
    print("Generated names:")
    for i in bucket_list:
        # Valid buckets are 0..bucket_number-1; anything else (including
        # negative values) can never be matched, so skip it up front.
        if i < 0 or i >= bucket_number:
            print("  (Skipped {i}, experiment only has {bucket_number} buckets)".format(
                i=i, bucket_number=bucket_number,
            ))
            continue
        print("  " + name_for(i, abbrev, my_args.exp, my_args.user, bucket_number))

    return 0
#####


### Helpers ###
def hash_exp(exp, name):
    """Return the hex MD5 digest of the experiment name followed by the user name (both UTF-8 encoded)."""
    hasher = hashlib.md5()
    hasher.update(exp.encode("utf-8"))
    hasher.update(name.encode("utf-8"))
    return hasher.hexdigest()


def bucket_int(hashed):
    """
    Collapse a hex digest into an integer, one bit per hex digit.

    Digits 0-7 become a 0 bit and digits 8-f become a 1 bit; the resulting
    bit string is read as a base-2 number.  The caller takes it modulo the
    number of buckets.
    """
    bits = re.sub("[0-7]", "0", hashed)
    bits = re.sub("[8-9a-f]", "1", bits)
    return int(bits, 2)


def name_for(bucket, abbrev, exp, name, number):
    """
    Generate a user name derived from `name` that lands in `bucket`.

    Candidates have the form `{name}-{abbrev}-{bucket}-{random suffix}` (the
    abbreviation part is omitted when `abbrev` is empty or None).  Random
    suffixes are tried until one hashes into the requested bucket.

    Arguments:
        bucket: target bucket index.
        abbrev: experiment abbreviation to embed in the name, may be empty.
        exp: experiment name used for hashing.
        name: base user name.
        number: total number of buckets in the experiment.

    Raises:
        RuntimeError: if no matching name is found in 100 * number tries.
    """
    abbrev_part = abbrev + "-" if abbrev else ""
    name_base = "{name}-{abbrev}{bucket}-".format(name=name, abbrev=abbrev_part, bucket=bucket)
    tries = 100 * number
    for _ in range(tries):
        suffix = "".join(random.choice(_SUFFIX_CHARS) for _ in range(_SUFFIX_LENGTH))
        candidate = name_base + suffix
        if bucket_int(hash_exp(exp, candidate)) % number == bucket:
            return candidate
    raise RuntimeError(
        "Failed to generate a name for bucket {bucket} in {tries} tries".format(bucket=bucket, tries=tries)
    )


def abbreviate(exp):
    """
    Deterministically create a short (at most 6 character) lowercase
    abbreviation of an experiment name, using initials where possible and
    trying to stay as recognizable as possible.
    """
    # Drop symbols that aren't allowed in usernames (_ and - are allowed; this
    # collapses them into -, simplifying some of the following).
    s = re.sub(r"[^0-9A-Za-z]+", "-", exp)
    if len(re.findall(r"(^|[-])\w", s)) >= 3:
        # Found at least a few word separators, use initials.
        s = re.sub(r"(^|[-])(\w)[^-]*", r"\2", s).lower()
        s = re.sub(r"[-]", "", s)  # drop stray separators
    elif len(re.findall(r"[A-Z][^A-Z]+", s)) >= 3:
        # Found at least a few capitalizations, use as initials, strip
        # lowercase and junk.
        s = re.sub(r"[a-z-]", "", s).lower()
    else:
        s = re.sub(r"[-]", "", s).lower()  # drop junk
        if len(s) > 6:
            # Drop vowels except first & last and let the shortener trim it
            # down from there.
            s = s[0] + re.sub(r"[aeiou]", "", s[1:-1]) + s[-1]

    if len(s) > 6:
        # Shorten abbreviation, keeping the beginning, middle, and last
        # characters to preserve recognizability.
        half = (len(s) - 1) // 2  # -1 to bias toward early-middle letters
        s = s[:2] + s[half - 1:half + 2] + s[-1:]
    return s
#####


#####
if __name__ == "__main__":
    xit = main(sys.argv, os.environ)
    sys.exit(xit)
#####