From 523bb0ea5d4ca23881d01974515b5dd52b66ce86 Mon Sep 17 00:00:00 2001
From: Ben Holt <bholt+github@edx.org>
Date: Thu, 17 Oct 2019 21:47:26 -0400
Subject: [PATCH] Add stable_bucketer helper tool (#21783)

Add stable_bucketer helper tool
---
 .../experiments/stable_bucketing.py           |   5 +
 scripts/stable_bucketer                       | 154 ++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100755 scripts/stable_bucketer

diff --git a/lms/djangoapps/experiments/stable_bucketing.py b/lms/djangoapps/experiments/stable_bucketing.py
index 0bc903ee8bf..b6709fd9639 100644
--- a/lms/djangoapps/experiments/stable_bucketing.py
+++ b/lms/djangoapps/experiments/stable_bucketing.py
@@ -1,6 +1,11 @@
 """
 An implementation of a stable bucketing algorithm that can be used
 to reliably group users into experiments.
+
+An implementation of this is available as a standalone command-line
+tool, `scripts/stable_bucketer`, which can both validate the
+bucketing of a username and generate recognizable usernames for
+particular experiment buckets for testing.
 """
 
 import hashlib
diff --git a/scripts/stable_bucketer b/scripts/stable_bucketer
new file mode 100755
index 00000000000..7f1712e0737
--- /dev/null
+++ b/scripts/stable_bucketer
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+
+import argparse
+import hashlib
+import random
+import re
+import string
+#####
+
+
+###  Main  ###
+def main(args, env):
+    "Parses the argv-style list args (args[0], the program name, is skipped) with env supplying the $USER default, prints the user's bucket and, unless --check-only or --print-args is given, one generated name per requested bucket; returns 0 (argparse exits the process itself on invalid arguments)"
+    epilog = "Checks username bucketing for experiments and generates names for each experiment bucket.  Derived names include the base user name, experiment abbreviation, bucket number, and a short random string, separated with hyphens.  (v1.0)"
+    parser = argparse.ArgumentParser(epilog=epilog)
+    parser.add_argument(
+        "exp",
+        metavar="EXPERIMENT",
+        help="Experiment to bucket for.",
+    )
+    parser.add_argument(
+        "user",
+        nargs="?",
+        default=env.get("USER", ""),
+        metavar="NAME",
+        help="Base user name for bucketing, default is $USER.",
+    )
+    parser.add_argument(
+        "-a", "--abbrev",
+        metavar="EXP",
+        help="Experiment abbreviation for name generation.",
+    )
+    parser.add_argument(
+        "-b", "--buckets",
+        nargs="+",
+        type=int,
+        metavar="X",
+        help="Buckets to make names for, default is all buckets.",
+    )
+    parser.add_argument(
+        "-c", "--check-only",
+        action="store_true",
+        help="Just check what bucket the user is in, don't generate names.",
+    )
+    parser.add_argument(
+        "-n", "--number",
+        type=int,
+        default=2,
+        metavar="N",
+        help="Number of buckets, default is 2.",
+    )
+    parser.add_argument(
+        "--print-args",
+        action="store_true",
+        help=argparse.SUPPRESS,  # hidden debugging flag: print arguments and computations, then exit
+    )
+    my_args = parser.parse_args(args[1:])  # use the argv we were handed, not the sys.argv global
+    bucket_number = my_args.number
+    if bucket_number < 1:
+        parser.error("number of buckets must be at least 1")  # exits; a zero count would divide by zero below
+
+    hashed = hash_exp(my_args.exp, my_args.user)
+    digest = bucket_int(hashed)
+    bucket = digest % bucket_number
+
+    abbrev = my_args.abbrev
+    if abbrev is None:
+        abbrev = abbreviate(my_args.exp)
+
+    print("{user} is in bucket: {bucket}".format(user=my_args.user, bucket=bucket))
+    if my_args.print_args:
+        print("* Args:\n\t{my_args}\n* Computed:\n\tdigest: {digest} - hash: {hashed} - {abbrev}".format(my_args=my_args, digest=digest, hashed=hashed, abbrev=abbrev))
+        return 0
+
+    if my_args.check_only:
+        return 0
+
+    bucket_list = my_args.buckets or range(bucket_number)  # no -b given means every bucket
+
+    # TODO: validate more of the arguments
+    # HACK: currently not enforcing the naming rules:
+    # - Username must be between 2 and 30 characters long.
+    # - Usernames can only contain letters (A-Z, a-z), numerals (0-9), underscores (_), and hyphens (-).
+    print("Generated names:")
+    for i in bucket_list:
+        if not 0 <= i < bucket_number:  # negative buckets can never be hit either, so skip them too
+            print("    (Skipped {i}, experiment only has {bucket_number} buckets)".format(i=i, bucket_number=bucket_number))
+            continue
+        print("    " + name_for(i, abbrev, my_args.exp, my_args.user, bucket_number))
+
+    return 0
+#####
+
+
+###  Helpers  ###
+def hash_exp(exp, name):
+    "Returns the hex MD5 digest of the UTF-8 bytes of exp immediately followed by those of name"
+    # One md5 call over the concatenated bytes yields the same digest as two successive update() calls
+    payload = exp.encode("utf-8") + name.encode("utf-8")
+    return hashlib.md5(payload).hexdigest()
+
+
+def bucket_int(hashed):
+    "Collapses each lowercase hex digit to one bit (0-7 -> 0, 8-f -> 1) and parses the resulting bit string as a base-2 integer"
+    bit_for_digit = str.maketrans("0123456789abcdef", "0000000011111111")  # other characters pass through and make int() raise ValueError
+    return int(hashed.translate(bit_for_digit), 2)
+
+
+def name_for(bucket, abbrev, exp, name, number):
+    "Returns name-[abbrev-]bucket-xxxxx, where xxxxx is a random 5-character suffix picked so the whole name hashes into bucket for exp; raises RuntimeError after 100 * number unsuccessful tries"
+    # A non-empty abbreviation gets its own trailing hyphen; an empty one adds nothing to the name
+    separated = abbrev + "-" if abbrev else abbrev
+    prefix = "{name}-{abbrev}{bucket}-".format(name=name, abbrev=separated, bucket=bucket)
+    alphabet = string.digits + string.ascii_lowercase
+    attempts = 100 * number
+    for _attempt in range(attempts):
+        suffix = "".join([random.choice(alphabet) for _ in range(5)])
+        candidate = prefix + suffix
+        if bucket_int(hash_exp(exp, candidate)) % number == bucket:
+            return candidate
+    raise RuntimeError("Failed to generate a name for bucket {bucket} in {tries} tries".format(bucket=bucket, tries=attempts))
+
+
+def abbreviate(exp):
+    "Deterministically derives a lowercase tag of at most 6 characters from exp, using word initials, capital letters, or a vowel-stripped spelling, whichever the name's shape supports"
+    cleaned = re.sub(r"[^0-9A-Za-z]+", "-", exp)  # every run of symbols that isn't a letter or digit becomes one "-" separator
+    word_starts = re.findall(r"(^|[-])\w", cleaned)
+    capitalized_runs = re.findall(r"[A-Z][^A-Z]+", cleaned)
+    if len(word_starts) >= 3:
+        # enough separated words: keep each word's first character, then drop leftover separators
+        initials = re.sub(r"(^|[-])(\w)[^-]*", r"\2", cleaned).lower()
+        short = initials.replace("-", "")
+    elif len(capitalized_runs) >= 3:
+        # enough capitalized runs: keep capitals (and digits), dropping lowercase letters and separators
+        short = re.sub(r"[a-z-]", "", cleaned).lower()
+    else:
+        short = cleaned.replace("-", "").lower()
+        if len(short) > 6:
+            # strip interior vowels; whatever is still too long gets trimmed below
+            short = short[0] + re.sub(r"[aeiou]", "", short[1:-1]) + short[-1]
+    if len(short) <= 6:
+        return short
+    half = (len(short) - 1) // 2  # -1 biases the kept middle toward earlier letters
+    return short[:2] + short[half - 1:half + 2] + short[-1:]  # first two, three around the middle, and the last
+#####
+
+
+#####
+if __name__ == "__main__":
+    # Script entry point: hand main() the process arguments and environment, then exit with its return value
+    import os
+    import sys
+    sys.exit(main(sys.argv, os.environ))
+#####
-- 
GitLab