Replaced troublesome pyes integration with direct calls to the Elasticsearch REST API

3b165da1 · Slater-Victoroff · d1be90bb · 3b165da1 · 3b165da1 · 3b165da1
Commit 3b165da1 authored 11 years ago by Slater-Victoroff
--- a/common/djangoapps/search/analyzer.json
+++ b/common/djangoapps/search/analyzer.json
+{
+"analyzer": {
+
+    "transcript_analyzer": {
+        "type": "custom",
+        "tokenizer": "standard",
+        "filter": ["protected", "asciifolding", "custom_word_delimiter", "lowercase", "custom_stemmer", "shingle"],
+        "char_filter": ["custom_mapping"]
+    }
+},
+
+"filter" : {
+
+    "custom_word_delimiter":{
+        "type": "word_delimiter",
+        "preserve_original": "true"
+    },
+
+    "custom_stemmer": {
+        "type": "stemmer",
+        "name": "english"
+    }, 
+
+    "protected": {
+        "type": "keyword_marker",
+        "keywords_path": "protectedWords.txt"
+    }
+},
+
+"char_filter": {
+    "custom_mapping": {
+        "type": "mapping",
+        "mappings": ["\n=>-"]
+    }
+}
+}
\ No newline at end of file
--- a/common/djangoapps/search/es_requests.py
+++ b/common/djangoapps/search/es_requests.py
+import requests
+import json
+
+
class ElasticDatabase(object):
    """
    Small client that talks to the Elasticsearch REST API through ``requests``.

    Every method builds a URL below ``self.url`` and issues one HTTP request;
    no connection state is kept between calls.
    """

    def __init__(self, url, index_settings_file, *args):
        """
        Store the server URL, the index settings and the mappings to install.

        Nothing is sent to the server here; call `parse_args` to create the
        indices and types described by `args`.

        url: base address of the server, something of the form
            `http://localhost:9200`. Importantly, do not include a slash at
            the end of the url.

        index_settings_file: path of a file whose raw contents are sent as the
            body of every index-creation request (see `setup_index`).

        args: each entry specifies the JSON mapping to be used for a specific
            type, either as a dictionary or as a JSON string encoding one.

        Example Dictionary:
            {"index": "transcript", "type": "6-002x", "mapping":
                {
                "properties" : {
                    "searchable_text": {
                        "type": "string",
                        "store": "yes",
                        "index": "analyzed"
                       }
                    }
                }
            }

        Eventually we will support different configuration files for different
        indices, but since this is only indexing transcripts right now it
        seems excessive.
        """

        self.url = url
        self.args = args
        # Read eagerly and close the handle right away instead of leaking it.
        with open(index_settings_file, 'rb') as settings_file:
            self.index_settings = settings_file.read()

    def _build_url(self, *parts):
        """Join `parts` onto the base url with single slashes."""
        return "/".join((self.url,) + parts)

    def parse_args(self):
        """
        Create the index and type mapping described by each entry of `self.args`.

        Processing stops at the first entry that is not a JSON object; entries
        that lack one of the "index", "type" or "mapping" keys are reported
        and skipped.
        """
        for raw_mapping in self.args:
            if isinstance(raw_mapping, dict):
                json_mapping = raw_mapping
            else:
                try:
                    json_mapping = json.loads(raw_mapping)
                except (TypeError, ValueError):
                    json_mapping = None

            if not isinstance(json_mapping, dict):
                print("Badly formed JSON args, please check your mappings file")
                break

            # Only the key lookups live inside the try block, so a KeyError
            # raised while talking to the server is not mistaken for a
            # malformed entry.
            try:
                index = json_mapping['index']
                type_ = json_mapping['type']
                mapping = json_mapping['mapping']
            except KeyError:
                print("Could not find needed keys. Keys found: ")
                print(list(json_mapping.keys()))
                continue

            self.setup_index(index)
            self.setup_type(index, type_, mapping)

    def setup_type(self, index, type_, json_mapping):
        """
        Install a mapping for `type_` inside `index`; returns the response.

        json_mapping should be a dictionary starting at the properties level of a mapping.

        The type level will be added, so if you include it things will break. The purpose of this
        is to encourage loose coupling between types and mappings for better code
        """

        full_url = self._build_url(index, type_, "_mapping")
        # Serialize explicitly: handing requests a dict makes it form-encode
        # the body rather than send JSON.
        json_put_body = json.dumps({type_: json_mapping})
        return requests.put(full_url, data=json_put_body)

    def has_index(self, index):
        """
        Check whether `index` exists on the server and return a boolean.

        A 200 response means the index exists and a 404 means it does not.
        Any other status code means something is wrong, so a RuntimeError
        carrying that status code is raised.
        """
        status = requests.head(self._build_url(index)).status_code
        if status == 200:
            return True
        if status == 404:
            return False
        raise RuntimeError("Got an unexpected response code: " + str(status))

    def setup_index(self, index):
        """Creates a new elasticsearch index, returns the response it gets"""
        full_url = self._build_url(index) + "/"
        return requests.put(full_url, data=self.index_settings)

    def index_data(self, index, type_, id_, json_data):
        """
        Store one document under `index`/`type_`/`id_`; returns the response.

        json_data may be an already serialized JSON string, or a dictionary,
        which is serialized here. It is assumed to match the type's mapping.
        """
        if isinstance(json_data, dict):
            json_data = json.dumps(json_data)
        return requests.put(self._build_url(index, type_, id_), data=json_data)

    def get_index_settings(self, index):
        """Returns the current settings of the given index as parsed JSON"""
        full_url = self._build_url(index, "_settings")
        return json.loads(requests.get(full_url).content)

    def get_type_mapping(self, index, type_):
        """Returns the current mapping of `type_` within `index` as parsed JSON"""
        full_url = self._build_url(index, type_, "_mapping")
        return json.loads(requests.get(full_url).content)
--- a/common/djangoapps/search/mapping.json
+++ b/common/djangoapps/search/mapping.json
@@ -5,20 +5,7 @@
 		"index": "analyzed",
 		"store": "yes",
 		"type": "string",
-		"term_vector": "with_positions_offsets"
-	},
-
-	"phonetic_text": {
-		"boost": 1.0,
-		"index": "analyzed",
-		"store": "yes",
-		"type": "string",
-		"term_vector": "with_positions_offsets"
-	},
-
-	"uuid": {
-		"index": "not_analyzed",
-		"store": "yes",
-		"type": "string"
+		"term_vector": "with_positions_offsets",
+        "analyzer": "transcript_analyzer"
 	}
 }
\ No newline at end of file
--- a/common/djangoapps/search/protectedWords.txt
+++ b/common/djangoapps/search/protectedWords.txt
+"gauss",
+"stokes",
+"navier",
+"einstein",
+"goddard",
+"oppenheimer",
+"bloch",
+"hawkings",
+"newton",
+"bohr",
+"darwin",
+"planck",
+"rontgen",
+"tesla",
+"franklin"
\ No newline at end of file
--- a/common/djangoapps/search/settings.json
+++ b/common/djangoapps/search/settings.json
+{
+	"settings": {
+		"index": {
+			"number_of_replicas": 2,
+			"number_of_shards": 3
+		}
+	}
+}
\ No newline at end of file