From a022461891bb664f4e8b8e1e84c37f2a8a83bb3f Mon Sep 17 00:00:00 2001 From: Jeremy Bowman <jbowman@edx.org> Date: Wed, 14 Feb 2018 17:48:25 -0500 Subject: [PATCH] PLAT-1957 Better memory leak diagnostic tooling --- cms/envs/common.py | 11 +- cms/envs/devstack.py | 13 +- lms/envs/common.py | 5 +- lms/envs/devstack.py | 16 +- lms/envs/load_test.py | 2 + .../djangoapps/monitoring_utils/__init__.py | 1 + .../core/djangoapps/monitoring_utils/apps.py | 16 + .../djangoapps/monitoring_utils/middleware.py | 22 +- .../djangoapps/monitoring_utils/signals.py | 20 + .../core/djangoapps/monitoring_utils/utils.py | 495 ++++++++++++++++++ openedx/tests/settings.py | 1 + requirements/edx/base.txt | 2 + 12 files changed, 582 insertions(+), 22 deletions(-) create mode 100644 openedx/core/djangoapps/monitoring_utils/apps.py create mode 100644 openedx/core/djangoapps/monitoring_utils/signals.py create mode 100644 openedx/core/djangoapps/monitoring_utils/utils.py diff --git a/cms/envs/common.py b/cms/envs/common.py index 5945a1be3b3..efed5ff60f2 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -43,6 +43,7 @@ from __future__ import absolute_import import imp import os +import tempfile import sys from datetime import timedelta @@ -322,7 +323,6 @@ GEOIPV6_PATH = REPO_ROOT / "common/static/data/geoip/GeoIPv6.dat" ############################# TEMPLATE CONFIGURATION ############################# # Mako templating -import tempfile MAKO_MODULE_DIR = os.path.join(tempfile.gettempdir(), 'mako_cms') MAKO_TEMPLATE_DIRS_BASE = [ PROJECT_ROOT / 'templates', @@ -447,11 +447,10 @@ else: _csrf_middleware = 'django.middleware.csrf.CsrfViewMiddleware' MIDDLEWARE_CLASSES = [ + 'openedx.core.djangoapps.monitoring_utils.middleware.MonitoringMemoryMiddleware', 'crum.CurrentRequestUserMiddleware', 'openedx.core.djangoapps.request_cache.middleware.RequestCache', - 'openedx.core.djangoapps.monitoring_utils.middleware.MonitoringMemoryMiddleware', - 'openedx.core.djangoapps.header_control.middleware.HeaderControlMiddleware', 'django.middleware.cache.UpdateCacheMiddleware', 'django.middleware.common.CommonMiddleware', @@ -515,6 +514,9 @@ X_FRAME_OPTIONS = 'ALLOW' # Platform for Privacy Preferences header P3P_HEADER = 'CP="Open EdX does not have a P3P policy."' +# Let MemoryUsageData create the directory as needed +MEMORY_GRAPH_DIRECTORY = None + ############# XBlock Configuration ########## # Import after sys.path fixup @@ -1127,6 +1129,9 @@ INSTALLED_APPS = [ # Asset management for mako templates 'pipeline_mako', + + # Memory leak diagnostics + 'openedx.core.djangoapps.monitoring_utils.apps.MonitoringUtilsConfig' ] diff --git a/cms/envs/devstack.py b/cms/envs/devstack.py index 8f675cdcf6b..737304b073a 100644 --- a/cms/envs/devstack.py +++ b/cms/envs/devstack.py @@ -2,6 +2,7 @@ Specific overrides to the base prod settings to make development easier. """ +import logging from os.path import abspath, dirname, join from .aws import * # pylint: disable=wildcard-import, unused-wildcard-import @@ -18,8 +19,6 @@ HTTPS = 'off' ################################ LOGGERS ###################################### -import logging - # Disable noisy loggers for pkg_name in ['track.contexts', 'track.middleware', 'dd.dogapi']: logging.getLogger(pkg_name).setLevel(logging.CRITICAL) @@ -95,16 +94,22 @@ DEBUG_TOOLBAR_CONFIG = { def should_show_debug_toolbar(request): - # We always want the toolbar on devstack unless running tests from another Docker container + # We always want the toolbar on devstack unless running tests from another + # Docker container or actively diagnosing a memory leak if request.get_host().startswith('edx.devstack.studio:'): return False - return True + from openedx.core.djangoapps.monitoring_utils import MemoryUsageData + return not MemoryUsageData.tables_are_enabled # To see stacktraces for MongoDB queries, set this to True. # Stacktraces slow down page loads drastically (for pages with lots of queries). DEBUG_TOOLBAR_MONGO_STACKTRACES = False +############################## MEMORY MONITORING ############################## + +MEMORY_GRAPH_DIRECTORY = REPO_ROOT / 'test_root' / 'log' / 'memory_graphs' / 'cms_{}'.format(os.getpid()) +WSGI_APPLICATION = 'openedx.core.djangoapps.monitoring_utils.WSGIServer' ################################ MILESTONES ################################ FEATURES['MILESTONES_APP'] = True diff --git a/lms/envs/common.py b/lms/envs/common.py index 0bb43f21472..f3506a61360 100644 --- a/lms/envs/common.py +++ b/lms/envs/common.py @@ -32,6 +32,7 @@ Longer TODO: import imp import sys import os +import tempfile import django from path import Path as path @@ -543,7 +544,6 @@ OAUTH2_PROVIDER_APPLICATION_MODEL = 'oauth2_provider.Application' ################################## TEMPLATE CONFIGURATION ##################################### # Mako templating -import tempfile MAKO_MODULE_DIR = os.path.join(tempfile.gettempdir(), 'mako_lms') MAKO_TEMPLATE_DIRS_BASE = [ PROJECT_ROOT / 'templates', @@ -1341,6 +1341,9 @@ X_FRAME_OPTIONS = 'ALLOW' # Platform for Privacy Preferences header P3P_HEADER = 'CP="Open EdX does not have a P3P policy."' +# Let MemoryUsageData create the directory as needed +MEMORY_GRAPH_DIRECTORY = None + ############################### PIPELINE ####################################### PIPELINE_ENABLED = True diff --git a/lms/envs/devstack.py b/lms/envs/devstack.py index 9d7b5cc62e1..f2c61810c7d 100644 --- a/lms/envs/devstack.py +++ b/lms/envs/devstack.py @@ -1,6 +1,7 @@ """ Specific overrides to the base prod settings to make development easier. """ +import logging from os.path import abspath, dirname, join from .aws import * # pylint: disable=wildcard-import, unused-wildcard-import @@ -26,7 +27,6 @@ ENTERPRISE_API_URL = LMS_INTERNAL_ROOT_URL + '/enterprise/api/v1/' ################################ LOGGERS ###################################### # Silence noisy logs -import logging LOG_OVERRIDES = [ ('track.contexts', logging.CRITICAL), ('track.middleware', logging.CRITICAL), @@ -88,10 +88,20 @@ DEBUG_TOOLBAR_CONFIG = { def should_show_debug_toolbar(request): - # We always want the toolbar on devstack unless running tests from another Docker container + # We always want the toolbar on devstack unless running tests from another + # Docker container or actively diagnosing a memory leak if request.get_host().startswith('edx.devstack.lms:'): return False - return True + from openedx.core.djangoapps.monitoring_utils import MemoryUsageData + return not MemoryUsageData.tables_are_enabled + + +############################## MEMORY MONITORING ############################## + +INSTALLED_APPS.append('openedx.core.djangoapps.monitoring_utils.apps.MonitoringUtilsConfig') +MEMORY_GRAPH_DIRECTORY = REPO_ROOT / 'test_root' / 'log' / 'memory_graphs' / 'lms_{}'.format(os.getpid()) +MIDDLEWARE_CLASSES.insert(0, 'openedx.core.djangoapps.monitoring_utils.middleware.MonitoringMemoryMiddleware') +WSGI_APPLICATION = 'openedx.core.djangoapps.monitoring_utils.WSGIServer' ########################### PIPELINE ################################# diff --git a/lms/envs/load_test.py b/lms/envs/load_test.py index cfc90add27d..1ec7b6dffe5 100644 --- a/lms/envs/load_test.py +++ b/lms/envs/load_test.py @@ -20,4 +20,6 @@ EXCLUDE_CSRF = lambda elem: elem not in [ DEFAULT_TEMPLATE_ENGINE['OPTIONS']['context_processors'] = filter( EXCLUDE_CSRF, DEFAULT_TEMPLATE_ENGINE['OPTIONS']['context_processors'] ) +INSTALLED_APPS.append('openedx.core.djangoapps.monitoring_utils.apps.MonitoringUtilsConfig') +MIDDLEWARE_CLASSES.insert(0, 'openedx.core.djangoapps.monitoring_utils.middleware.MonitoringMemoryMiddleware') MIDDLEWARE_CLASSES = filter(EXCLUDE_CSRF, MIDDLEWARE_CLASSES) diff --git a/openedx/core/djangoapps/monitoring_utils/__init__.py b/openedx/core/djangoapps/monitoring_utils/__init__.py index ed653274786..957fe8a18ae 100644 --- a/openedx/core/djangoapps/monitoring_utils/__init__.py +++ b/openedx/core/djangoapps/monitoring_utils/__init__.py @@ -23,6 +23,7 @@ TODO: supply additional public functions for storing strings and booleans. from contextlib import contextmanager from . import middleware +from .utils import MemoryUsageData, WSGIServer try: import newrelic.agent except ImportError: diff --git a/openedx/core/djangoapps/monitoring_utils/apps.py b/openedx/core/djangoapps/monitoring_utils/apps.py new file mode 100644 index 00000000000..2b995421f63 --- /dev/null +++ b/openedx/core/djangoapps/monitoring_utils/apps.py @@ -0,0 +1,16 @@ +""" +Monitoring Utilities Configuration +""" +from __future__ import absolute_import + +from django.apps import AppConfig + + +class MonitoringUtilsConfig(AppConfig): + """ + Default configuration for the "openedx.core.djangoapps.monitoring_utils" Django application. + """ + name = u'openedx.core.djangoapps.monitoring_utils' + + def ready(self): + from . import signals # pylint: disable=unused-variable diff --git a/openedx/core/djangoapps/monitoring_utils/middleware.py b/openedx/core/djangoapps/monitoring_utils/middleware.py index 433f31be4c3..70e6eb4404b 100644 --- a/openedx/core/djangoapps/monitoring_utils/middleware.py +++ b/openedx/core/djangoapps/monitoring_utils/middleware.py @@ -13,6 +13,7 @@ import logging from uuid import uuid4 import psutil +from six import text_type from openedx.core.djangoapps.request_cache import get_cache from openedx.core.djangoapps.waffle_utils import WaffleSwitchNamespace @@ -89,27 +90,26 @@ class MonitoringMemoryMiddleware(object): guid_key = u'guid_key' def process_request(self, request): + """ + Record pre-request memory usage data + """ if self._is_enabled(): - self._cache[self.guid_key] = unicode(uuid4()) + setattr(request, self.guid_key, text_type(uuid4())) log_prefix = self._log_prefix(u"Before", request) - self._cache[self.memory_data_key] = self._memory_data(log_prefix) + setattr(request, self.memory_data_key, self._memory_data(log_prefix)) def process_response(self, request, response): + """ + Record post-request memory usage data + """ if self._is_enabled(): log_prefix = self._log_prefix(u"After", request) new_memory_data = self._memory_data(log_prefix) log_prefix = self._log_prefix(u"Diff", request) - self._log_diff_memory_data(log_prefix, new_memory_data, self._cache.get(self.memory_data_key)) + self._log_diff_memory_data(log_prefix, new_memory_data, getattr(request, self.memory_data_key)) return response - @property - def _cache(self): - """ - Namespaced request cache for tracking memory usage. - """ - return get_cache(name='monitoring_memory') - def _log_prefix(self, prefix, request): """ Returns a formatted prefix for logging for the given request. @@ -117,7 +117,7 @@ class MonitoringMemoryMiddleware(object): # After a celery task runs, the request cache is cleared. So if celery # tasks are running synchronously (CELERY_ALWAYS _EAGER), "guid_key" # will no longer be in the request cache when process_response executes. - cached_guid = self._cache.get(self.guid_key) or u"without_guid" + cached_guid = getattr(request, self.guid_key) or u"without_guid" return u"{} request '{} {} {}'".format(prefix, request.method, request.path, cached_guid) def _memory_data(self, log_prefix): diff --git a/openedx/core/djangoapps/monitoring_utils/signals.py b/openedx/core/djangoapps/monitoring_utils/signals.py new file mode 100644 index 00000000000..f9cb50dd3fa --- /dev/null +++ b/openedx/core/djangoapps/monitoring_utils/signals.py @@ -0,0 +1,20 @@ +""" +Memory leak troubleshooting via request lifecycle signals. +""" + +from __future__ import absolute_import + +from django.core.signals import request_started +from django.dispatch import receiver + +from . import MemoryUsageData + + +@receiver(request_started) +def reset_memory_statistics(sender, **kwargs): # pylint: disable=unused-argument + """ + Use Django's signal for the start of request processing as the trigger to + start tracking new objects in memory when the + ``monitoring_utils.log_memory_tables`` Waffle switch is enabled. + """ + MemoryUsageData.start_counting() diff --git a/openedx/core/djangoapps/monitoring_utils/utils.py b/openedx/core/djangoapps/monitoring_utils/utils.py new file mode 100644 index 00000000000..4d594835ea7 --- /dev/null +++ b/openedx/core/djangoapps/monitoring_utils/utils.py @@ -0,0 +1,495 @@ +""" +Monitoring utilities which aren't used by the application by default, but can +be used as needed to troubleshoot problems. +""" +from __future__ import absolute_import, print_function + +import gc +import itertools +import logging +import operator +import os +import socket +import sys +import tempfile +from StringIO import StringIO +from collections import defaultdict + +from django.conf import settings +from django.core.servers.basehttp import WSGIServer as DjangoWSGIServer +from django.utils.lru_cache import lru_cache + +import gunicorn.util +from objgraph import ( + _long_typename, + _short_typename, + at_addrs, + show_backrefs, + show_refs, +) + +from openedx.core.djangoapps.waffle_utils import WaffleSwitchNamespace + +indices = defaultdict(itertools.count) + +# The directory in which graph files will be created. +GRAPH_DIRECTORY_PATH = settings.MEMORY_GRAPH_DIRECTORY + +# The max number of object types for which to show data on the console +MAX_CONSOLE_ROWS = 30 + +# The max number of object types for which to generate reference graphs +MAX_GRAPHED_OBJECT_TYPES = 5 + +# Maximum depth of forward reference graphs +REFS_DEPTH = 3 + +# Maximum depth of backward reference graphs +BACK_REFS_DEPTH = 8 + +# Max number of objects per type to use as starting points in the reference graphs +MAX_OBJECTS_PER_TYPE = 10 + +# Object type names for which table rows and graphs should not be generated if +# the new object count is below the given threshold. "set" is ignored by +# default because many sets are created in the course of tracking the number +# of new objects of each type. "ApdexStats", "SplitResult", and "TimeStats" +# are New Relic data which sometimes outlives the duration of the request but +# usually doesn't stick around long-term. +IGNORE_THRESHOLDS = { + 'ApdexStats': 10, + 'SplitResult': 50, + 'TimeStats': 500, + 'set': 10000, +} + +WAFFLE_NAMESPACE = 'monitoring_utils' + +log = logging.getLogger(__name__) + + +def show_memory_leaks( + label=u'memory_leaks', + max_console_rows=MAX_CONSOLE_ROWS, + max_graphed_object_types=MAX_GRAPHED_OBJECT_TYPES, + refs_depth=REFS_DEPTH, + back_refs_depth=BACK_REFS_DEPTH, + max_objects_per_type=MAX_OBJECTS_PER_TYPE, + ignore_thresholds=None, + graph_directory_path=GRAPH_DIRECTORY_PATH, + memory_table_buffer=None, + skip_first_graphs=True): + """ + Call this function to get data about memory leaks; what objects are being + leaked, where did they come from, and what do they contain? The leaks + are measured from the last call to ``get_new_ids()`` (which is called + within this function). Some data is printed to stdout, and more details + are available in graphs stored at the paths printed to stdout. Subsequent + calls with the same label are indicated by an increasing index in the + filename. + + Args: + label (unicode): The start of the filename for each graph + max_console_rows (int): The max number of object types for which to + show data on the console + max_graphed_object_types (int): The max number of object types for + which to generate reference graphs + refs_depth (int): Maximum depth of forward reference graphs + back_refs_depth (int): Maximum depth of backward reference graphs + max_objects_per_type (int): Max number of objects per type to use as + starting points in the reference graphs + ignore_thresholds (dict): Object type names for which table rows and + graphs should not be generated if the new object count is below + the corresponding number. + graph_directory_path (unicode): The directory in which graph files + will be created. It will be created if it doesn't already exist. + memory_table_buffer (StringIO): Storage for the generated table of + memory statistics. Ideally, create this before starting to + count newly allocated objects. + skip_first_graphs (bool): True if the first call to this function for + a given label should not produce graphs (the default behavior). + The first call to a given block of code often initializes an + assortment of objects which aren't really leaked memory. + """ + if graph_directory_path is None: + graph_directory_path = MemoryUsageData.graph_directory_path() + if ignore_thresholds is None: + ignore_thresholds = IGNORE_THRESHOLDS + if memory_table_buffer is None: + memory_table_buffer = StringIO() + new_ids = get_new_ids(limit=max_console_rows, ignore_thresholds=ignore_thresholds, + output=memory_table_buffer) + memory_table_text = memory_table_buffer.getvalue() + log.info('\n' + memory_table_text) + + if not os.path.exists(graph_directory_path): + os.makedirs(graph_directory_path) + label = label.replace(':', '_') + index = indices[label].next() + 1 + data = {'label': label, 'index': index} + path = os.path.join(graph_directory_path, u'{label}_{index}.txt'.format(**data)) + with open(path, 'w') as f: + f.write(memory_table_text) + + if index == 1 and skip_first_graphs: + return + + graphed_types = 0 + sorted_by_count = sorted(new_ids.items(), key=lambda entry: len(entry[1]), reverse=True) + for item in sorted_by_count: + type_name = item[0] + object_ids = new_ids[type_name] + if not object_ids: + continue + objects = at_addrs(list(object_ids)[:max_objects_per_type]) + data['type_name'] = type_name + + if back_refs_depth > 0: + path = os.path.join(graph_directory_path, u'{label}_{index}_{type_name}_backrefs.dot'.format(**data)) + show_backrefs(objects, max_depth=back_refs_depth, filename=path) + log.info('Generated memory graph at {}'.format(path)) + + if refs_depth > 0: + path = os.path.join(graph_directory_path, u'{label}_{index}_{type_name}_refs.dot'.format(**data)) + show_refs(objects, max_depth=refs_depth, filename=path) + log.info('Generated memory graph at {}'.format(path)) + + graphed_types += 1 + if graphed_types >= max_graphed_object_types: + break + + +class MemoryUsageData(object): + """ + Memory analysis data and configuration options for the current request. + Do *NOT* use this in production; it slows down most requests by about an + order of magnitude, even the ones which aren't being specifically studied. + + Call ``MemoryUsageData.analyze()`` from a view and enable the appropriate + waffle switch(es) to start generating memory leak diagnostic information: + + * monitoring_utils.log_memory_tables - Log a table of data on object types + for which the total number in memory increased during the request. + Enabling this switch disables Django Debug Toolbar, since it leaks + many objects with every request. + + * monitoring_utils.create_memory_graphs - Also generate reference graphs + for some of the apparently leaked objects. + + When using this in development via Django's runserver command, be sure to + pass it the ``--nothreading`` option to avoid concurrent memory changes + while serving static assets. In devstack, do this in docker-compose.yml. + + To use this feature on a sandbox, you also need to append + ``openedx.core.djangoapps.monitoring_utils.apps.MonitoringUtilsConfig`` to + the end of the ``INSTALLED_APPS`` Django setting. This is present by + default for devstack and load test environments, but absent from the + ``aws`` settings module to avoid a little overhead at the start of each + request even when the Waffle switches are disabled (mainly just to load + the switch from the database). + + Configuration options for the depth of the graphs, how many leaked + objects of each type to graph, and so forth are currently set as constants + in ``monitoring_utils.utils``. The graphs are saved as GraphViz .dot + files in the directory specified by the ``MEMORY_GRAPH_DIRECTORY`` Django + setting. These can be viewed directly using xdot (Linux) or ZGRViewer + (macOS), or converted to a standard image format such as PNG or SVG using + GraphViz. + """ + graphs_are_enabled = False + tables_are_enabled = False + table_buffer = None + view_name = None + gunicorn_is_patched = False + + @classmethod + def start_counting(cls): + """ + Prepare to collect memory usage data for a new request. + """ + if cls._is_switch_enabled(u'log_memory_tables'): + if not cls.gunicorn_is_patched: + cls._patch_gunicorn() + cls.tables_are_enabled = True + cls.table_buffer = StringIO() + cls.graphs_are_enabled = cls._is_switch_enabled(u'create_memory_graphs') + cls._set_memory_leak_baseline() + + @classmethod + def analyze(cls, request): + """ + Call this anywhere in a view to record memory usage data at the end + of the request. + """ + cls.view_name = request.resolver_match.view_name + + @classmethod + def stop_counting(cls): + """ + Stop collecting memory usage data for the current request, and + generate any requested output for it. + """ + if cls.tables_are_enabled and cls.view_name: + if cls.graphs_are_enabled: + show_memory_leaks(cls.view_name, memory_table_buffer=cls.table_buffer) + else: + show_memory_leaks(cls.view_name, refs_depth=0, + back_refs_depth=0, memory_table_buffer=cls.table_buffer) + cls._reset() + + @classmethod + @lru_cache() + def graph_directory_path(cls): + """ + Get the default temporary directory for the current process in which + to store memory reference graphs. + """ + if settings.ROOT_URLCONF == 'lms.urls': + service = 'lms' + else: + service = 'cms' + return os.path.join(tempfile.mkdtemp(prefix='memory_graphs'), + '{service}_{pid}'.format(service=service, pid=os.getpid())) + + @staticmethod + def _is_switch_enabled(name): + return WaffleSwitchNamespace(name=WAFFLE_NAMESPACE).is_enabled(name) + + @classmethod + def _patch_gunicorn(cls): + """ + Patch gunicorn to record memory usage data when appropriate. Django's + ``request_finished`` signal and gunicorn's ``post_request`` hook + aren't called late enough to be useful for this; the response is still + in scope, so none of the objects attached to it can be garbage + collected yet. + """ + gunicorn.util.close = gunicorn_util_close + cls.gunicorn_is_patched = True + + @classmethod + def _reset(cls): + """ + Reset all the attributes to their default values in preparation for a + new request. + """ + cls.graphs_are_enabled = False + cls.tables_are_enabled = False + cls.table_buffer = None + cls.view_name = None + + @classmethod + def _set_memory_leak_baseline(cls): + """ + Reset the starting point from which the next call to + ``objgraph.get_new_ids()`` will count newly created objects. + """ + with open(os.devnull, 'w') as devnull: + get_new_ids(output=devnull) + + +def gunicorn_util_close(sock): + """ + Replacement for gunicorn.util.close() which does memory usage analysis if + the relevant Waffle switch was active at the start of the request. + + This monkeypatch is appropriate for gunicorn==0.17.4 and should be updated + as needed when upgrading gunicorn. + """ + try: + sock.close() + except socket.error: + pass + MemoryUsageData.stop_counting() + + +class WSGIServer(DjangoWSGIServer): + """ + A WSGI server to be used by Django's runserver management command so that + memory usage can be analyzed after the response is garbage collected. + Specified by the ``WSGI_SERVER`` Django setting in the devstack settings + files. + """ + def close_request(self, request): + MemoryUsageData.stop_counting() + + +# The following is copied and modified from objgraph, since it doesn't yet +# provide good hooks for customizing this operation + +def get_new_ids(skip_update=False, limit=10, sortby='deltas', # pylint: disable=dangerous-default-value + shortnames=None, ignore_thresholds=IGNORE_THRESHOLDS, + output=None, _state={}): + """Find and display new objects allocated since last call. + + Shows the increase in object counts since last call to this + function and returns the memory address ids for new objects. + + Returns a dictionary mapping object type names to sets of object IDs + that have been created since the last time this function was called. + + ``skip_update`` (bool): If True, returns the same dictionary that + was returned during the previous call without updating the internal + state or examining the objects currently in memory. + + ``limit`` (int): The maximum number of rows that you want to print + data for. Use 0 to suppress the printing. Use None to print everything. + + ``sortby`` (str): This is the column that you want to sort by in + descending order. Possible values are: 'old', 'current', 'new', + 'deltas' + + ``shortnames`` (bool): If True, classes with the same name but + defined in different modules will be lumped together. If False, + all type names will be qualified with the module name. If None (default), + ``get_new_ids`` will remember the value from previous calls, so it's + enough to prime this once. By default the primed value is True. + + ``_state`` (dict): Stores old, current, and new_ids in memory. + It is used by the function to store the internal state between calls. + Never pass in this argument unless you know what you're doing. + + The caveats documented in :func:`growth` apply. + + When one gets new_ids from :func:`get_new_ids`, one can use + :func:`at_addrs` to get a list of those objects. Then one can iterate over + the new objects, print out what they are, and call :func:`show_backrefs` or + :func:`show_chain` to see where they are referenced. + + Example: + + >>> _ = get_new_ids() # store current objects in _state + >>> _ = get_new_ids() # current_ids become old_ids in _state + >>> a = [0, 1, 2] # list we don't know about + >>> b = [3, 4, 5] # list we don't know about + >>> new_ids = get_new_ids(limit=3) # we see new lists + ====================================================================== + Type Old_ids Current_ids New_ids Count_Deltas + ====================================================================== + list 324 326 +3 +2 + dict 1125 1125 +0 +0 + wrapper_descriptor 1001 1001 +0 +0 + ====================================================================== + >>> new_lists = at_addrs(new_ids['list']) + >>> a in new_lists + True + >>> b in new_lists + True + """ + if ignore_thresholds is None: + ignore_thresholds = IGNORE_THRESHOLDS + _initialize_state(_state) + new_ids = _state['new'] + if skip_update: + return new_ids + old_ids = _state['old'] + current_ids = _state['current'] + if shortnames is None: + shortnames = _state['shortnames'] + else: + _state['shortnames'] = shortnames + gc.collect() + objects = gc.get_objects() + for class_name in old_ids: + old_ids[class_name].clear() + for class_name, ids_set in current_ids.items(): + old_ids[class_name].update(ids_set) + for class_name in current_ids: + current_ids[class_name].clear() + for o in objects: + if shortnames: + class_name = _short_typename(o) + else: + class_name = _long_typename(o) + id_number = id(o) + current_ids[class_name].add(id_number) + for class_name in new_ids: + new_ids[class_name].clear() + rows = [] + keys_to_remove = [] + for class_name in current_ids: + num_old = len(old_ids[class_name]) + num_current = len(current_ids[class_name]) + if num_old == 0 and num_current == 0: + # remove the key from our dicts if we don't have any old or + # current class_name objects + keys_to_remove.append(class_name) + continue + new_ids_set = current_ids[class_name] - old_ids[class_name] + new_ids[class_name].update(new_ids_set) + num_new = len(new_ids_set) + num_delta = num_current - num_old + if num_delta < 1 or (class_name in ignore_thresholds and num_current < ignore_thresholds[class_name]): + # ignore types with no net increase or whose overall count isn't large enough to worry us + if class_name in new_ids: + del new_ids[class_name] + continue + row = (class_name, num_old, num_current, num_new, num_delta) + rows.append(row) + for key in keys_to_remove: + del old_ids[key] + del current_ids[key] + if key in new_ids: + del new_ids[key] + index_by_sortby = {'old': 1, 'current': 2, 'new': 3, 'deltas': 4} + rows.sort(key=operator.itemgetter(index_by_sortby[sortby], 0), + reverse=True) + _show_results(rows, limit, output) + return new_ids + + +def _initialize_state(state): + """ + Initialize the object ID tracking data if it hasn't been done yet. + """ + if not state: + state['old'] = defaultdict(set) + state['current'] = defaultdict(set) + state['new'] = defaultdict(set) + state['shortnames'] = True + + +def _show_results(rows, limit, output): + """ + Show information about the memory leaked since the previous call to + ``get_new_ids()``. + + Args: + rows (list): The data rows to be displayed (if any) + limit (int): The max number of rows to display + output (stream): The output stream to send results to + """ + if output is None: + output = sys.stdout + if not rows: + _show_no_leaks_message(output) + else: + _show_leaks_table(rows, limit, output) + + +def _show_no_leaks_message(output): + """ + Print a message, indicating that no memory leaks were found, to the given + output stream. + """ + print('=' * 51, file=output) + print('No object types increased their net count in memory', file=output) + print('=' * 51, file=output) + + +def _show_leaks_table(rows, limit, output): + """ + Print a summary table of the leaked objects to the given output stream. + """ + if limit is not None: + rows = rows[:limit] + width = max(len(row[0]) for row in rows) + print('=' * (width + 13 * 4), file=output) + print('%-*s%13s%13s%13s%13s' % + (width, 'Type', 'Old_ids', 'Current_ids', 'New_ids', 'Count_Deltas'), + file=output) + print('=' * (width + 13 * 4), file=output) + for row_class, old, current, new, delta in rows: + print('%-*s%13d%13d%+13d%+13d' % + (width, row_class, old, current, new, delta), file=output) + print('=' * (width + 13 * 4), file=output) diff --git a/openedx/tests/settings.py b/openedx/tests/settings.py index 9b3691d2301..24167b3c145 100644 --- a/openedx/tests/settings.py +++ b/openedx/tests/settings.py @@ -83,6 +83,7 @@ INSTALLED_APPS = ( LMS_ROOT_URL = 'http://localhost:8000' MEDIA_ROOT = tempfile.mkdtemp() +MEMORY_GRAPH_DIRECTORY = tempfile.mkdtemp(prefix='memory_graphs') MICROSITE_BACKEND = 'microsite_configuration.backends.filebased.FilebasedMicrositeBackend' MICROSITE_TEMPLATE_BACKEND = 'microsite_configuration.backends.filebased.FilebasedMicrositeTemplateBackend' diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt index 0f36dbeca75..9826923f948 100644 --- a/requirements/edx/base.txt +++ b/requirements/edx/base.txt @@ -75,6 +75,7 @@ fs-s3fs==0.1.5 futures==3.2.0 ; python_version == "2.7" GitPython==0.3.2.RC1 glob2==0.3 +graphviz==0.8.2 gunicorn==0.17.4 help-tokens==1.0.3 httpretty==0.8.14 @@ -84,6 +85,7 @@ mako==1.0.2 Markdown>=2.6,<2.7 mongoengine==0.10.0 MySQL-python==1.2.5 +objgraph==3.4.0 networkx==1.7 nltk==3.2.5 nose-xunitmp==0.3.2 -- GitLab