From a802e5a21dd521257518b0dc83d4458245c4bed7 Mon Sep 17 00:00:00 2001
From: Jeffrey Lovitz
Date: Wed, 18 Dec 2019 14:53:19 -0500
Subject: [PATCH 01/15] Change directory structure

---
 bulk_insert/__init__.py                      |  0
 bulk_insert.py => bulk_insert/bulk_insert.py |  0
 setup.py                                     | 17 +++++++++++++++++
 3 files changed, 17 insertions(+)
 create mode 100644 bulk_insert/__init__.py
 rename bulk_insert.py => bulk_insert/bulk_insert.py (100%)
 create mode 100644 setup.py

diff --git a/bulk_insert/__init__.py b/bulk_insert/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bulk_insert.py b/bulk_insert/bulk_insert.py
similarity index 100%
rename from bulk_insert.py
rename to bulk_insert/bulk_insert.py
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1acf928
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,17 @@
+
+from setuptools import setup, find_packages
+setup(
+    name='redisgraph-bulk-loader',
+    version='0.9dev',
+
+    description='RedisGraph Bulk Import Tool',
+    url='https://github.com/redisgraph/redisgraph-bulk-loader',
+    packages=find_packages(),
+    install_requires=['redis', 'click'],
+    classifiers=[
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: BSD License',
+        'Programming Language :: Python :: 3.0',
+        'Topic :: Database'
+    ]
+)

From 6e8e44197ea9227de68daea50bcf0378876cad21 Mon Sep 17 00:00:00 2001
From: Jeffrey Lovitz
Date: Wed, 18 Dec 2019 15:30:34 -0500
Subject: [PATCH 02/15] WIP

---
 .gitignore                   |  30 +++
 bulk_insert/__init__.py      |   5 +
 bulk_insert/bulk_insert.py   | 353 ++---------------------------------
 bulk_insert/configs.py       |  16 ++
 bulk_insert/entity_file.py   | 122 ++++++++++++
 bulk_insert/label.py         |  55 ++++++
 bulk_insert/module_vars.py   |  10 +
 bulk_insert/query_buffer.py  |  56 ++++++
 bulk_insert/relation_type.py |  60 ++++++
 9 files changed, 373 insertions(+), 334 deletions(-)
 create mode 100644 bulk_insert/configs.py
 create mode 100644 bulk_insert/entity_file.py
 create mode 100644 bulk_insert/label.py
 create mode 100644 bulk_insert/module_vars.py
 create mode 100644 bulk_insert/query_buffer.py
 create mode 100644 bulk_insert/relation_type.py

diff --git a/.gitignore b/.gitignore
index 722d5e7..89c4502 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,31 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
 .vscode
+
diff --git a/bulk_insert/__init__.py b/bulk_insert/__init__.py
index e69de29..989c1b9 100644
--- a/bulk_insert/__init__.py
+++ b/bulk_insert/__init__.py
@@ -0,0 +1,5 @@
+from .configs import Configs
+from .label import Label
+from .relation_type import RelationType
+
+from .query_buffer import QueryBuffer
diff --git a/bulk_insert/bulk_insert.py b/bulk_insert/bulk_insert.py
index 10b4786..6155884 100644
--- a/bulk_insert/bulk_insert.py
+++ b/bulk_insert/bulk_insert.py
@@ -8,335 +8,27 @@
 from timeit import default_timer as timer
 import redis
 import click
-
-# Global variables
-CONFIGS = None # thresholds for batching Redis queries
-NODE_DICT = {} # global node dictionary
-TOP_NODE_ID = 0 # next ID to assign to a node
-QUERY_BUF = None # Buffer for query being constructed
-QUOTING = None
-
-FIELD_TYPES = None
-
-# Custom error class for invalid inputs
-class CSVError(Exception):
-    pass
-
-# Official enum support varies widely between 2.7 and 3.x, so we'll use a custom class
-class Type:
-    NULL = 0
-    BOOL = 1
-    NUMERIC = 2
-    STRING = 3
-
-# User-configurable thresholds for when to send queries to Redis
-class Configs(object):
-    def __init__(self, max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes, skip_invalid_edges):
-        # Maximum number of tokens per query
-        # 1024 * 1024 is the hard-coded Redis maximum. We'll set a slightly lower limit so
-        # that we can safely ignore tokens that aren't binary strings
-        # ("GRAPH.BULK", "BEGIN", graph name, counts)
-        self.max_token_count = min(max_token_count, 1024 * 1023)
-        # Maximum size in bytes per query
-        self.max_buffer_size = max_buffer_size * 1000000
-        # Maximum size in bytes per token
-        # 512 megabytes is a hard-coded Redis maximum
-        self.max_token_size = min(max_token_size * 1000000, 512 * 1000000)
-
-        self.skip_invalid_nodes = skip_invalid_nodes
-        self.skip_invalid_edges = skip_invalid_edges
-
-# QueryBuffer is the class that processes input CSVs and emits their binary formats to the Redis client.
-class QueryBuffer(object):
-    def __init__(self, graphname, client):
-        # Redis client and data for each query
-        self.client = client
-
-        # Sizes for buffer currently being constructed
-        self.redis_token_count = 0
-        self.buffer_size = 0
-
-        # The first query should include a "BEGIN" token
-        self.graphname = graphname
-        self.initial_query = True
-
-        self.node_count = 0
-        self.relation_count = 0
-
-        self.labels = [] # List containing all pending Label objects
-        self.reltypes = [] # List containing all pending RelationType objects
-
-        self.nodes_created = 0 # Total number of nodes created
-        self.relations_created = 0 # Total number of relations created
-
-    # Send all pending inserts to Redis
-    def send_buffer(self):
-        # Do nothing if we have no entities
-        if self.node_count == 0 and self.relation_count == 0:
-            return
-
-        args = [self.node_count, self.relation_count, len(self.labels), len(self.reltypes)] + self.labels + self.reltypes
-        # Prepend a "BEGIN" token if this is the first query
-        if self.initial_query:
-            args.insert(0, "BEGIN")
-            self.initial_query = False
-
-        result = self.client.execute_command("GRAPH.BULK", self.graphname, *args)
-        stats = result.split(', '.encode())
-        self.nodes_created += int(stats[0].split(' '.encode())[0])
-        self.relations_created += int(stats[1].split(' '.encode())[0])
-
-        self.clear_buffer()
-
-    # Delete all entities that have been inserted
-    def clear_buffer(self):
-        self.redis_token_count = 0
-        self.buffer_size = 0
-
-        # All constructed entities have been inserted, so clear buffers
-        self.node_count = 0
-        self.relation_count = 0
-        del self.labels[:]
-        del self.reltypes[:]
-
-    def report_completion(self, runtime):
-        print("Construction of graph '%s' complete: %d nodes created, %d relations created in %f seconds"
-              % (self.graphname, self.nodes_created, self.relations_created, runtime))
-
-# Superclass for label and relation CSV files
-class EntityFile(object):
-    def __init__(self, filename, separator):
-        # The label or relation type string is the basename of the file
-        self.entity_str = os.path.splitext(os.path.basename(filename))[0]
-        # Input file handling
-        self.infile = io.open(filename, 'rt')
-        # Initialize CSV reader that ignores leading whitespace in each field
-        # and does not modify input quote characters
-        self.reader = csv.reader(self.infile, delimiter=separator, skipinitialspace=True, quoting=QUOTING)
-
-        self.prop_offset = 0 # Starting index of properties in row
-        self.prop_count = 0 # Number of properties per entity
-
-        self.packed_header = b''
-        self.binary_entities = []
-        self.binary_size = 0 # size of binary token
-        self.count_entities() # number of entities/row in file.
-
-    # Count number of rows in file.
-    def count_entities(self):
-        self.entities_count = 0
-        self.entities_count = sum(1 for line in self.infile)
-        # discard header row
-        self.entities_count -= 1
-        # seek back
-        self.infile.seek(0)
-        return self.entities_count
-
-    # Simple input validations for each row of a CSV file
-    def validate_row(self, expected_col_count, row):
-        # Each row should have the same number of fields
-        if len(row) != expected_col_count:
-            raise CSVError("%s:%d Expected %d columns, encountered %d ('%s')"
-                           % (self.infile.name, self.reader.line_num, expected_col_count, len(row), ','.join(row)))
-
-    # If part of a CSV file was sent to Redis, delete the processed entities and update the binary size
-    def reset_partial_binary(self):
-        self.binary_entities = []
-        self.binary_size = len(self.packed_header)
-
-    # Convert property keys from a CSV file header into a binary string
-    def pack_header(self, header):
-        prop_count = len(header) - self.prop_offset
-        # String format
-        entity_bytes = self.entity_str.encode()
-        fmt = "=%dsI" % (len(entity_bytes) + 1) # Unaligned native, entity name, count of properties
-        args = [entity_bytes, prop_count]
-        for p in header[self.prop_offset:]:
-            prop = p.encode()
-            fmt += "%ds" % (len(prop) + 1) # encode string with a null terminator
-            args.append(prop)
-        return struct.pack(fmt, *args)
-
-    # Convert a list of properties into a binary string
-    def pack_props(self, line):
-        props = []
-        for num, field in enumerate(line[self.prop_offset:]):
-            field_type_idx = self.prop_offset+num
-            try:
-                FIELD_TYPES[self.entity_str][field_type_idx]
-            except:
-                props.append(prop_to_binary(field, None))
-            else:
-                props.append(prop_to_binary(field, FIELD_TYPES[self.entity_str][field_type_idx]))
-        return b''.join(p for p in props)
-
-    def to_binary(self):
-        return self.packed_header + b''.join(self.binary_entities)
-
-# Handler class for processing label csv files.
-class Label(EntityFile):
-    def __init__(self, infile, separator):
-        super(Label, self).__init__(infile, separator)
-        expected_col_count = self.process_header()
-        self.process_entities(expected_col_count)
-        self.infile.close()
-
-    def process_header(self):
-        # Header format:
-        # node identifier (which may be a property key), then all other property keys
-        header = next(self.reader)
-        expected_col_count = len(header)
-        # If identifier field begins with an underscore, don't add it as a property.
-        if header[0][0] == '_':
-            self.prop_offset = 1
-        self.packed_header = self.pack_header(header)
-        self.binary_size += len(self.packed_header)
-        return expected_col_count
-
-    def process_entities(self, expected_col_count):
-        global NODE_DICT
-        global TOP_NODE_ID
-        global QUERY_BUF
-
-        entities_created = 0
-        with click.progressbar(self.reader, length=self.entities_count, label=self.entity_str) as reader:
-            for row in reader:
-                self.validate_row(expected_col_count, row)
-                # Add identifier->ID pair to dictionary if we are building relations
-                if NODE_DICT is not None:
-                    if row[0] in NODE_DICT:
-                        sys.stderr.write("Node identifier '%s' was used multiple times - second occurrence at %s:%d\n"
-                                         % (row[0], self.infile.name, self.reader.line_num))
-                        if CONFIGS.skip_invalid_nodes is False:
-                            exit(1)
-                    NODE_DICT[row[0]] = TOP_NODE_ID
-                    TOP_NODE_ID += 1
-                row_binary = self.pack_props(row)
-                row_binary_len = len(row_binary)
-                # If the addition of this entity will make the binary token grow too large,
-                # send the buffer now.
-                if self.binary_size + row_binary_len > CONFIGS.max_token_size:
-                    QUERY_BUF.labels.append(self.to_binary())
-                    QUERY_BUF.send_buffer()
-                    self.reset_partial_binary()
-                    # Push the label onto the query buffer again, as there are more entities to process.
-                    QUERY_BUF.labels.append(self.to_binary())
-
-                QUERY_BUF.node_count += 1
-                entities_created += 1
-                self.binary_size += row_binary_len
-                self.binary_entities.append(row_binary)
-        QUERY_BUF.labels.append(self.to_binary())
-        print("%d nodes created with label '%s'" % (entities_created, self.entity_str))
-
-# Handler class for processing relation csv files.
-class RelationType(EntityFile):
-    def __init__(self, infile, separator):
-        super(RelationType, self).__init__(infile, separator)
-        expected_col_count = self.process_header()
-        self.process_entities(expected_col_count)
-        self.infile.close()
-
-    def process_header(self):
-        # Header format:
-        # source identifier, dest identifier, properties[0..n]
-        header = next(self.reader)
-        # Assume rectangular CSVs
-        expected_col_count = len(header)
-        self.prop_count = expected_col_count - 2
-        if self.prop_count < 0:
-            raise CSVError("Relation file '%s' should have at least 2 elements in header line."
-                           % (self.infile.name))
-
-        self.prop_offset = 2
-        self.packed_header = self.pack_header(header) # skip src and dest identifiers
-        self.binary_size += len(self.packed_header)
-        return expected_col_count
-
-    def process_entities(self, expected_col_count):
-        entities_created = 0
-        with click.progressbar(self.reader, length=self.entities_count, label=self.entity_str) as reader:
-            for row in reader:
-                self.validate_row(expected_col_count, row)
-                try:
-                    src = NODE_DICT[row[0]]
-                    dest = NODE_DICT[row[1]]
-                except KeyError as e:
-                    print("Relationship specified a non-existent identifier. src: %s; dest: %s" % (row[0], row[1]))
-                    if CONFIGS.skip_invalid_edges is False:
-                        raise e
-                    continue
-                fmt = "=QQ" # 8-byte unsigned ints for src and dest
-                row_binary = struct.pack(fmt, src, dest) + self.pack_props(row)
-                row_binary_len = len(row_binary)
-                # If the addition of this entity will make the binary token grow too large,
-                # send the buffer now.
-                if self.binary_size + row_binary_len > CONFIGS.max_token_size:
-                    QUERY_BUF.reltypes.append(self.to_binary())
-                    QUERY_BUF.send_buffer()
-                    self.reset_partial_binary()
-                    # Push the reltype onto the query buffer again, as there are more entities to process.
-                    QUERY_BUF.reltypes.append(self.to_binary())
-
-                QUERY_BUF.relation_count += 1
-                entities_created += 1
-                self.binary_size += row_binary_len
-                self.binary_entities.append(row_binary)
-        QUERY_BUF.reltypes.append(self.to_binary())
-        print("%d relations created for type '%s'" % (entities_created, self.entity_str))
-
-# Convert a single CSV property field into a binary stream.
-# Supported property types are string, numeric, boolean, and NULL.
-# type is either Type.NUMERIC, Type.BOOL or Type.STRING, and explicitly sets the value to this type if possible
-def prop_to_binary(prop_val, type):
-    # All format strings start with an unsigned char to represent our Type enum
-    format_str = "=B"
-    if prop_val is None:
-        # An empty field indicates a NULL property
-        return struct.pack(format_str, Type.NULL)
-
-    # If field can be cast to a float, allow it
-    if type == None or type == Type.NUMERIC:
-        try:
-            numeric_prop = float(prop_val)
-            if not math.isnan(numeric_prop) and not math.isinf(numeric_prop): # Don't accept non-finite values.
-                return struct.pack(format_str + "d", Type.NUMERIC, numeric_prop)
-        except:
-            pass
-
-    if type == None or type == Type.BOOL:
-        # If field is 'false' or 'true', it is a boolean
-        if prop_val.lower() == 'false':
-            return struct.pack(format_str + '?', Type.BOOL, False)
-        elif prop_val.lower() == 'true':
-            return struct.pack(format_str + '?', Type.BOOL, True)
-
-    if type == None or type == Type.STRING:
-        # If we've reached this point, the property is a string
-        encoded_str = str.encode(prop_val) # struct.pack requires bytes objects as arguments
-        # Encoding len+1 adds a null terminator to the string
-        format_str += "%ds" % (len(encoded_str) + 1)
-        return struct.pack(format_str, Type.STRING, encoded_str)
-
-    ## if it hasn't returned by this point, it is trying to set it to a type that it can't adopt
-    raise Exception("unable to parse [" + prop_val + "] with type ["+repr(type)+"]")
+from configs import Configs
+from query_buffer import QueryBuffer
+from label import Label
+from relation_type import RelationType
+import module_vars
 
 # For each node input file, validate contents and convert to binary format.
 # If any buffer limits have been reached, flush all enqueued inserts to Redis.
 def process_entity_csvs(cls, csvs, separator):
-    global QUERY_BUF
     for in_csv in csvs:
         # Build entity descriptor from input CSV
         entity = cls(in_csv, separator)
         added_size = entity.binary_size
         # Check to see if the addition of this data will exceed the buffer's capacity
-        if (QUERY_BUF.buffer_size + added_size >= CONFIGS.max_buffer_size
-                or QUERY_BUF.redis_token_count + len(entity.binary_entities) >= CONFIGS.max_token_count):
+        if (module_vars.QUERY_BUF.buffer_size + added_size >= module_vars.CONFIGS.max_buffer_size
+                or module_vars.QUERY_BUF.redis_token_count + len(entity.binary_entities) >= module_vars.CONFIGS.max_token_count):
             # Send and flush the buffer if appropriate
-            QUERY_BUF.send_buffer()
+            module_vars.QUERY_BUF.send_buffer()
         # Add binary data to list and update all counts
-        QUERY_BUF.redis_token_count += len(entity.binary_entities)
-        QUERY_BUF.buffer_size += added_size
+        module_vars.QUERY_BUF.redis_token_count += len(entity.binary_entities)
+        module_vars.QUERY_BUF.buffer_size += added_size
 
 # Command-line arguments
 @click.command()
@@ -360,26 +52,19 @@ def process_entity_csvs(cls, csvs, separator):
 def bulk_insert(graph, host, port, password, nodes, relations, separator, max_token_count,
                 max_buffer_size, max_token_size, quote, field_types, skip_invalid_nodes, skip_invalid_edges):
-    global CONFIGS
-    global NODE_DICT
-    global TOP_NODE_ID
-    global QUERY_BUF
-    global QUOTING
-    global FIELD_TYPES
-
     if sys.version_info[0] < 3:
         raise Exception("Python 3 is required for the RedisGraph bulk loader.")
 
     if field_types is not None:
         try:
-            FIELD_TYPES = json.loads(field_types)
+            module_vars.FIELD_TYPES = json.loads(field_types)
         except:
             raise Exception("Problem parsing field-types. Use the format {