From a6003991a5cabf48294c9c01806d789e3a791e17 Mon Sep 17 00:00:00 2001
From: Nimisha Asthagiri <nasthagiri@edx.org>
Date: Wed, 3 Dec 2014 21:41:41 -0500
Subject: [PATCH] MA-157 Mobile analytics end-to-end. Move app-generated
 context back into properties. Ignore BI events through segment's webhook.
 Ignore events without Data in Properties.

---
 common/djangoapps/track/views/segmentio.py    | 42 ++++++++++---------
 .../track/views/tests/test_segmentio.py       | 39 ++++++++++++-----
 lms/envs/aws.py                               |  9 +++-
 lms/envs/common.py                            |  1 +
 4 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/common/djangoapps/track/views/segmentio.py b/common/djangoapps/track/views/segmentio.py
index 59e1823af2d..6221064e16a 100644
--- a/common/djangoapps/track/views/segmentio.py
+++ b/common/djangoapps/track/views/segmentio.py
@@ -24,6 +24,7 @@ WARNING_IGNORED_TYPE = 'Type ignored'
 ERROR_MISSING_USER_ID = 'Required user_id missing from context'
 ERROR_USER_NOT_EXIST = 'Specified user does not exist'
 ERROR_INVALID_USER_ID = 'Unable to parse userId as an integer'
+ERROR_MISSING_DATA = 'The data field must be specified in the properties dictionary'
 ERROR_MISSING_NAME = 'The name field must be specified in the properties dictionary'
 ERROR_MISSING_TIMESTAMP = 'Required timestamp field not found'
 ERROR_MISSING_RECEIVED_AT = 'Required receivedAt field not found'
@@ -119,7 +120,7 @@ def track_segmentio_event(request):  # pylint: disable=too-many-statements
     full_segment_event = request.json
 
     # We mostly care about the properties
-    segment_event = full_segment_event.get('properties', {})
+    segment_properties = full_segment_event.get('properties', {})
 
     # Start with the context provided by segment.io in the "client" field if it exists
     # We should tightly control which fields actually get included in the event emitted.
@@ -136,32 +137,38 @@ def track_segmentio_event(request):  # pylint: disable=too-many-statements
     else:
         context['event_source'] = event_source
 
-    # Ignore types that are unsupported
+    if 'name' not in segment_properties:
+        raise EventValidationError(ERROR_MISSING_NAME)
+
+    if 'data' not in segment_properties:
+        raise EventValidationError(ERROR_MISSING_DATA)
+
+    # Ignore event types and names that are unsupported
     segment_event_type = full_segment_event.get('type')
+    segment_event_name = segment_properties['name']
     allowed_types = [a.lower() for a in getattr(settings, 'TRACKING_SEGMENTIO_ALLOWED_TYPES', [])]
-    if not segment_event_type or segment_event_type.lower() not in allowed_types:
+    disallowed_substring_names = [
+        a.lower() for a in getattr(settings, 'TRACKING_SEGMENTIO_DISALLOWED_SUBSTRING_NAMES', [])
+    ]
+    if (
+        not segment_event_type or
+        (segment_event_type.lower() not in allowed_types) or
+        any(disallowed_subs_name in segment_event_name.lower() for disallowed_subs_name in disallowed_substring_names)
+    ):
         raise EventValidationError(WARNING_IGNORED_TYPE)
 
     if segment_context:
-        # copy required fields from segment's context dict to our custom context dict
-        for context_field_name, segment_context_field_name in [
-            ('course_id', 'course_id'),
-            ('open_in_browser_url', 'open_in_browser_url'),
-            ('agent', 'userAgent')
-        ]:
-            if segment_context_field_name in segment_context:
-                context[context_field_name] = segment_context[segment_context_field_name]
-
         # copy the entire segment's context dict as a sub-field of our custom context dict
         context['client'] = dict(segment_context)
+        context['agent'] = segment_context.get('userAgent', '')
 
         # remove duplicate and unnecessary fields from our copy
-        for field in ('traits', 'integrations', 'userAgent', 'course_id', 'open_in_browser_url'):
+        for field in ('traits', 'integrations', 'userAgent'):
             if field in context['client']:
                 del context['client'][field]
 
     # Overlay any context provided in the properties
-    context.update(segment_event.get('context', {}))
+    context.update(segment_properties.get('context', {}))
 
     user_id = full_segment_event.get('userId')
     if not user_id:
@@ -203,13 +210,10 @@ def track_segmentio_event(request):  # pylint: disable=too-many-statements
     else:
         raise EventValidationError(ERROR_MISSING_RECEIVED_AT)
 
-    if 'name' not in segment_event:
-        raise EventValidationError(ERROR_MISSING_NAME)
-
-    context['ip'] = segment_event.get('context', {}).get('ip', '')
+    context['ip'] = segment_properties.get('context', {}).get('ip', '')
 
     with tracker.get_tracker().context('edx.segmentio', context):
-        tracker.emit(segment_event['name'], segment_event.get('data', {}))
+        tracker.emit(segment_event_name, segment_properties.get('data', {}))
 
 
 def parse_iso8601_timestamp(timestamp):
diff --git a/common/djangoapps/track/views/tests/test_segmentio.py b/common/djangoapps/track/views/tests/test_segmentio.py
index 1a02dfb6f5e..7f90be0e09a 100644
--- a/common/djangoapps/track/views/tests/test_segmentio.py
+++ b/common/djangoapps/track/views/tests/test_segmentio.py
@@ -44,6 +44,7 @@ def expect_failure_with_message(message):
     TRACKING_SEGMENTIO_WEBHOOK_SECRET=SECRET,
     TRACKING_IGNORE_URL_PATTERNS=[ENDPOINT],
     TRACKING_SEGMENTIO_ALLOWED_TYPES=['track'],
+    TRACKING_SEGMENTIO_DISALLOWED_SUBSTRING_NAMES=['.bi.'],
     TRACKING_SEGMENTIO_SOURCE_MAP={'test-app': 'mobile'},
     EVENT_TRACKING_PROCESSORS=MOBILE_SHIM_PROCESSOR,
 )
@@ -97,6 +98,11 @@ class SegmentIOTrackingTestCase(EventTrackingTestCase):
     def test_segmentio_ignore_actions(self, action):
         self.post_segmentio_event(action=action)
 
+    @data('edx.bi.some_name', 'EDX.BI.CAPITAL_NAME')
+    @expect_failure_with_message(segmentio.WARNING_IGNORED_TYPE)
+    def test_segmentio_ignore_names(self, name):
+        self.post_segmentio_event(name=name)
+
     def post_segmentio_event(self, **kwargs):
         """Post a fake segment.io event to the view that processes it"""
         request = self.create_request(
@@ -114,6 +120,9 @@ class SegmentIOTrackingTestCase(EventTrackingTestCase):
             "properties": {
                 'name': kwargs.get('name', str(sentinel.name)),
                 'data': kwargs.get('data', {}),
+                'context': {
+                    'course_id': kwargs.get('course_id') or '',
+                }
             },
             "channel": 'server',
             "context": {
@@ -122,7 +131,6 @@ class SegmentIOTrackingTestCase(EventTrackingTestCase):
                     "version": "unknown"
                 },
                 'userAgent': str(sentinel.user_agent),
-                'course_id': kwargs.get('course_id') or '',
             },
             "receivedAt": "2014-08-27T16:33:39.100Z",
             "timestamp": "2014-08-27T16:33:39.215Z",
@@ -139,10 +147,7 @@ class SegmentIOTrackingTestCase(EventTrackingTestCase):
         }
 
         if 'context' in kwargs:
-            sample_event['context'].update(kwargs['context'])
-
-        if 'open_in_browser_url' in kwargs:
-            sample_event['context']['open_in_browser_url'] = kwargs['open_in_browser_url']
+            sample_event['properties']['context'].update(kwargs['context'])
 
         return sample_event
 
@@ -231,6 +236,18 @@ class SegmentIOTrackingTestCase(EventTrackingTestCase):
 
         segmentio.track_segmentio_event(request)
 
+    @expect_failure_with_message(segmentio.ERROR_MISSING_DATA)
+    def test_missing_data(self):
+        sample_event_raw = self.create_segmentio_event()
+        del sample_event_raw['properties']['data']
+        request = self.create_request(
+            data=json.dumps(sample_event_raw),
+            content_type='application/json'
+        )
+        User.objects.create(pk=USER_ID, username=str(sentinel.username))
+
+        segmentio.track_segmentio_event(request)
+
     @expect_failure_with_message(segmentio.ERROR_MISSING_TIMESTAMP)
     def test_missing_timestamp(self):
         sample_event_raw = self.create_event_without_fields('timestamp')
@@ -305,8 +322,8 @@ class SegmentIOTrackingTestCase(EventTrackingTestCase):
             data=self.create_segmentio_event_json(
                 name=name,
                 data=input_payload,
-                open_in_browser_url='https://testserver/courses/foo/bar/baz/courseware/Week_1/Activity/2',
                 context={
+                    'open_in_browser_url': 'https://testserver/courses/foo/bar/baz/courseware/Week_1/Activity/2',
                     'course_id': course_id,
                     'application': {
                         'name': 'edx.mobileapp.android',
@@ -344,11 +361,11 @@ class SegmentIOTrackingTestCase(EventTrackingTestCase):
                             'name': 'test-app',
                             'version': 'unknown'
                         },
-                        'application': {
-                            'name': 'edx.mobileapp.android',
-                            'version': '29',
-                            'component': 'videoplayer'
-                        }
+                    },
+                    'application': {
+                        'name': 'edx.mobileapp.android',
+                        'version': '29',
+                        'component': 'videoplayer'
                     },
                     'received_at': datetime.strptime("2014-08-27T16:33:39.100Z", "%Y-%m-%dT%H:%M:%S.%fZ"),
                 },
diff --git a/lms/envs/aws.py b/lms/envs/aws.py
index bacd50a31ef..db98070efca 100644
--- a/lms/envs/aws.py
+++ b/lms/envs/aws.py
@@ -397,8 +397,15 @@ STUDENT_FILEUPLOAD_MAX_SIZE = ENV_TOKENS.get("STUDENT_FILEUPLOAD_MAX_SIZE", STUD
 # Event tracking
 TRACKING_BACKENDS.update(AUTH_TOKENS.get("TRACKING_BACKENDS", {}))
 EVENT_TRACKING_BACKENDS.update(AUTH_TOKENS.get("EVENT_TRACKING_BACKENDS", {}))
-TRACKING_SEGMENTIO_WEBHOOK_SECRET = AUTH_TOKENS.get("TRACKING_SEGMENTIO_WEBHOOK_SECRET", TRACKING_SEGMENTIO_WEBHOOK_SECRET)
+TRACKING_SEGMENTIO_WEBHOOK_SECRET = AUTH_TOKENS.get(
+    "TRACKING_SEGMENTIO_WEBHOOK_SECRET",
+    TRACKING_SEGMENTIO_WEBHOOK_SECRET
+)
 TRACKING_SEGMENTIO_ALLOWED_TYPES = ENV_TOKENS.get("TRACKING_SEGMENTIO_ALLOWED_TYPES", TRACKING_SEGMENTIO_ALLOWED_TYPES)
+TRACKING_SEGMENTIO_DISALLOWED_SUBSTRING_NAMES = ENV_TOKENS.get(
+    "TRACKING_SEGMENTIO_DISALLOWED_SUBSTRING_NAMES",
+    TRACKING_SEGMENTIO_DISALLOWED_SUBSTRING_NAMES
+)
 TRACKING_SEGMENTIO_SOURCE_MAP = ENV_TOKENS.get("TRACKING_SEGMENTIO_SOURCE_MAP", TRACKING_SEGMENTIO_SOURCE_MAP)
 
 # Student identity verification settings
diff --git a/lms/envs/common.py b/lms/envs/common.py
index 6b260e44c02..9e0c63f6cf1 100644
--- a/lms/envs/common.py
+++ b/lms/envs/common.py
@@ -540,6 +540,7 @@ if FEATURES.get('ENABLE_SQL_TRACKING_LOGS'):
 
 TRACKING_SEGMENTIO_WEBHOOK_SECRET = None
 TRACKING_SEGMENTIO_ALLOWED_TYPES = ['track']
+TRACKING_SEGMENTIO_DISALLOWED_SUBSTRING_NAMES = ['.bi.']
 TRACKING_SEGMENTIO_SOURCE_MAP = {
     'analytics-android': 'mobile',
     'analytics-ios': 'mobile',
-- 
GitLab