
Commit 5501e43

Refactor schema reading in header
1 parent 0f51542 commit 5501e43

File tree

7 files changed, +130 -112 lines changed


bulk_insert/bulk_insert.py

Lines changed: 19 additions & 20 deletions
@@ -10,12 +10,19 @@
 import module_vars
 
 
-# For each node input file, validate contents and convert to binary format.
-# If any buffer limits have been reached, flush all enqueued inserts to Redis.
-def process_entity_csvs(cls, csvs, separator):
-    for in_csv in csvs:
+def parse_schemas(cls, csvs):
+    schemas = [None] * len(csvs)
+    for idx, in_csv in enumerate(csvs):
         # Build entity descriptor from input CSV
-        entity = cls(in_csv, separator)
+        schemas[idx] = cls(in_csv)
+    return schemas
+
+
+# For each input file, validate contents and convert to binary format.
+# If any buffer limits have been reached, flush all enqueued inserts to Redis.
+def process_entities(entities):
+    for entity in entities:
+        entity.process_entities()
         added_size = entity.binary_size
         # Check to see if the addition of this data will exceed the buffer's capacity
         if (module_vars.QUERY_BUF.buffer_size + added_size >= module_vars.CONFIGS.max_buffer_size
@@ -42,24 +49,16 @@ def process_entity_csvs(cls, csvs, separator):
 @click.option('--max-buffer-size', '-b', default=2048, help='max buffer size in megabytes (default 2048)')
 @click.option('--max-token-size', '-t', default=500, help='max size of each token in megabytes (default 500, max 512)')
 @click.option('--quote', '-q', default=3, help='the quoting format used in the CSV file. QUOTE_MINIMAL=0,QUOTE_ALL=1,QUOTE_NONNUMERIC=2,QUOTE_NONE=3')
-@click.option('--field-types', '-f', default=None, help='json to set explicit types for each field, format {<label>:[<col1 type>, <col2 type> ...]} where type can be 0(null),1(bool),2(numeric),3(string)')
 @click.option('--skip-invalid-nodes', '-s', default=False, is_flag=True, help='ignore nodes that use previously defined IDs')
 @click.option('--skip-invalid-edges', '-e', default=False, is_flag=True, help='ignore invalid edges, print an error message and continue loading (True), or stop loading after an edge loading failure (False)')
-@click.option('--enforce-schema', '-S', default=False, is_flag=True, help='header line introduces property schema')
-def bulk_insert(graph, host, port, password, nodes, relations, separator, max_token_count, max_buffer_size, max_token_size, quote, field_types, skip_invalid_nodes, skip_invalid_edges, enforce_schema):
+def bulk_insert(graph, host, port, password, nodes, relations, separator, max_token_count, max_buffer_size, max_token_size, quote, skip_invalid_nodes, skip_invalid_edges):
     if sys.version_info[0] < 3:
         raise Exception("Python 3 is required for the RedisGraph bulk loader.")
 
-    if field_types is not None:
-        try:
-            module_vars.FIELD_TYPES = json.loads(field_types)
-        except:
-            raise Exception("Problem parsing field-types. Use the format {<label>:[<col1 type>, <col2 type> ...]} where type can be 0(null),1(bool),2(numeric),3(string) ")
-
     module_vars.QUOTING = int(quote)
 
     module_vars.TOP_NODE_ID = 0 # reset global ID variable (in case we are calling bulk_insert from unit tests)
-    module_vars.CONFIGS = Configs(max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes, skip_invalid_edges, enforce_schema)
+    module_vars.CONFIGS = Configs(max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes, skip_invalid_edges, separator)
 
     start_time = timer()
     # Attempt to connect to Redis server
@@ -85,9 +84,9 @@ def bulk_insert(graph, host, port, password, nodes, relations, separator, max_to
         print("Graph with name '%s', could not be created, as Redis key '%s' already exists." % (graph, graph))
         sys.exit(1)
 
-    # If we're enforcing a schema, validate the headers in each file?
-    if enforce_schema:
-        pass
+    # Read the header rows of each input CSV and save its schema.
+    labels = parse_schemas(Label, nodes)
+    reltypes = parse_schemas(RelationType, relations)
 
     module_vars.QUERY_BUF = QueryBuffer(graph, client)
 
@@ -97,10 +96,10 @@ def bulk_insert(graph, host, port, password, nodes, relations, separator, max_to
     else:
         module_vars.NODE_DICT = None
 
-    process_entity_csvs(Label, nodes, separator)
+    process_entities(labels)
 
     if relations:
-        process_entity_csvs(RelationType, relations, separator)
+        process_entities(reltypes)
 
     # Send all remaining tokens to Redis
    module_vars.QUERY_BUF.send_buffer()
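
To make the new two-phase flow concrete, here is a minimal usage sketch (not part of the commit). The file names and option values are invented, and it assumes the remaining module-level state that bulk_insert() normally prepares (QUOTING, NODE_DICT, QUERY_BUF) has already been set up.

    # Illustrative sketch only -- file names and option values are hypothetical.
    import module_vars
    from configs import Configs
    from label import Label

    # The separator now lives on CONFIGS instead of being threaded through each call.
    module_vars.CONFIGS = Configs(1024 * 1023, 2048, 500, False, False, ',')

    # Phase 1: read only the header row of each node CSV and build its schema.
    labels = parse_schemas(Label, ['Person.csv', 'Country.csv'])

    # Phase 2: stream the data rows into binary tokens, flushing to Redis as buffers fill.
    process_entities(labels)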

bulk_insert/configs.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 # User-configurable thresholds for when to send queries to Redis
 class Configs(object):
-    def __init__(self, max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes, skip_invalid_edges, enforce_schema):
+    def __init__(self, max_token_count, max_buffer_size, max_token_size, skip_invalid_nodes, skip_invalid_edges, separator):
         # Maximum number of tokens per query
         # 1024 * 1024 is the hard-coded Redis maximum. We'll set a slightly lower limit so
         # that we can safely ignore tokens that aren't binary strings
@@ -15,4 +15,4 @@ def __init__(self, max_token_count, max_buffer_size, max_token_size, skip_invali
         self.skip_invalid_nodes = skip_invalid_nodes
         self.skip_invalid_edges = skip_invalid_edges
 
-        self.enforce_schema = enforce_schema
+        self.separator = separator

bulk_insert/entity_file.py

Lines changed: 69 additions & 32 deletions
@@ -1,114 +1,151 @@
 import os
 import io
 import csv
+import math
 import struct
 import module_vars
-from exceptions import CSVError
-import schema
+import configs
+from exceptions import CSVError, SchemaError
+from schema import Type, convert_schema_type
 
 
 # Convert a single CSV property field into a binary stream.
 # Supported property types are string, numeric, boolean, and NULL.
 # type is either Type.DOUBLE, Type.BOOL or Type.STRING, and explicitly sets the value to this type if possible
-def prop_to_binary(prop_val, type):
-    # All format strings start with an unsigned char to represent our Type enum
+def prop_to_binary(prop_val, prop_type):
+    # All format strings start with an unsigned char to represent our prop_type enum
     format_str = "=B"
     if prop_val is None:
         # An empty field indicates a NULL property
         return struct.pack(format_str, Type.NULL)
 
     # If field can be cast to a float, allow it
-    if type == None or type == Type.DOUBLE:
+    if prop_type is None or prop_type == Type.DOUBLE:
         try:
             numeric_prop = float(prop_val)
             if not math.isnan(numeric_prop) and not math.isinf(numeric_prop): # Don't accept non-finite values.
                 return struct.pack(format_str + "d", Type.DOUBLE, numeric_prop)
         except:
-            pass
+            raise SchemaError("Could not parse '%s' as a double" % prop_val)
 
-    if type == None or type == Type.BOOL:
+    if prop_type is None or prop_type == Type.BOOL:
         # If field is 'false' or 'true', it is a boolean
         if prop_val.lower() == 'false':
             return struct.pack(format_str + '?', Type.BOOL, False)
         elif prop_val.lower() == 'true':
            return struct.pack(format_str + '?', Type.BOOL, True)
 
-    if type == None or type == Type.STRING:
+    if prop_type is None or prop_type == Type.STRING:
         # If we've reached this point, the property is a string
         encoded_str = str.encode(prop_val) # struct.pack requires bytes objects as arguments
         # Encoding len+1 adds a null terminator to the string
         format_str += "%ds" % (len(encoded_str) + 1)
-        return struct.pack(format_str, schema.Type.STRING, encoded_str)
+        return struct.pack(format_str, Type.STRING, encoded_str)
 
+    if prop_type in (Type.LABEL, Type.TYPE, Type.ID): # TODO tmp, treat as string for testing
+        encoded_str = str.encode(prop_val) # struct.pack requires bytes objects as arguments
+        # Encoding len+1 adds a null terminator to the string
+        format_str += "%ds" % (len(encoded_str) + 1)
+        return struct.pack(format_str, Type.STRING, encoded_str)
+
+    import ipdb
+    ipdb.set_trace()
     # If it hasn't returned by this point, it is trying to set it to a type that it can't adopt
     raise Exception("unable to parse [" + prop_val + "] with type ["+repr(type)+"]")
 
 
 # Superclass for label and relation CSV files
 class EntityFile(object):
-    def __init__(self, filename, separator):
+    def __init__(self, filename):
         # The label or relation type string is the basename of the file
         self.entity_str = os.path.splitext(os.path.basename(filename))[0]
         # Input file handling
         self.infile = io.open(filename, 'rt')
         # Initialize CSV reader that ignores leading whitespace in each field
         # and does not modify input quote characters
-        self.reader = csv.reader(self.infile, delimiter=separator, skipinitialspace=True, quoting=module_vars.QUOTING)
-
-        self.prop_offset = 0 # Starting index of properties in row
-        self.prop_count = 0 # Number of properties per entity
+        self.reader = csv.reader(self.infile, delimiter=module_vars.CONFIGS.separator, skipinitialspace=True, quoting=module_vars.QUOTING)
 
         self.packed_header = b''
         self.binary_entities = []
         self.binary_size = 0 # size of binary token
+
+        # Extract data from header row.
+        self.convert_header()
+
         self.count_entities() # number of entities/row in file.
+        next(self.reader) # Skip header for next read.
 
     # Count number of rows in file.
     def count_entities(self):
         self.entities_count = 0
         self.entities_count = sum(1 for line in self.infile)
-        # discard header row
-        self.entities_count -= 1
         # seek back
         self.infile.seek(0)
         return self.entities_count
 
     # Simple input validations for each row of a CSV file
-    def validate_row(self, expected_col_count, row):
+    def validate_row(self, row):
         # Each row should have the same number of fields
-        if len(row) != expected_col_count:
+        if len(row) != self.column_count:
             raise CSVError("%s:%d Expected %d columns, encountered %d ('%s')"
-                           % (self.infile.name, self.reader.line_num, expected_col_count, len(row), ','.join(row)))
+                           % (self.infile.name, self.reader.line_num, self.column_count, len(row), configs.separator.join(row)))
 
     # If part of a CSV file was sent to Redis, delete the processed entities and update the binary size
     def reset_partial_binary(self):
         self.binary_entities = []
         self.binary_size = len(self.packed_header)
 
     # Convert property keys from a CSV file header into a binary string
-    def pack_header(self, header):
-        prop_count = len(header) - self.prop_offset
+    def pack_header(self):
         # String format
         entity_bytes = self.entity_str.encode()
         fmt = "=%dsI" % (len(entity_bytes) + 1) # Unaligned native, entity name, count of properties
-        args = [entity_bytes, prop_count]
-        for p in header[self.prop_offset:]:
-            prop = p.encode()
+        args = [entity_bytes, self.prop_count]
+        for idx in range(self.column_count):
+            if self.skip_offsets[idx]:
+                continue
+            prop = self.column_names[idx].encode()
             fmt += "%ds" % (len(prop) + 1) # encode string with a null terminator
             args.append(prop)
         return struct.pack(fmt, *args)
 
+    # Extract column names and types from a header row
+    def convert_header(self):
+        header = next(self.reader)
+        self.column_count = len(header)
+        self.column_names = [None] * self.column_count # Property names of every column.
+        self.types = [None] * self.column_count # Value type of every column.
+        self.skip_offsets = [False] * self.column_count # Whether column at any offset should not be stored as a property.
+
+        for idx, field in enumerate(header):
+            pair = field.split(':')
+            if len(pair) > 2:
+                raise CSVError("Field '%s' had %d colons" % field, len(field))
+            elif len(pair) < 2:
+                self.types[idx] = convert_schema_type(pair[0].casefold())
+                self.skip_offsets[idx] = True
+                if self.types[idx] not in (Type.ID, Type.START_ID, Type.END_ID, Type.IGNORE):
+                    # Any other field should have 2 elements
+                    raise SchemaError("Each property in the header should be a colon-separated pair")
+            else:
+                self.column_names[idx] = pair[0]
+                self.types[idx] = convert_schema_type(pair[1].casefold())
+                if self.types[idx] in (Type.START_ID, Type.END_ID, Type.IGNORE):
+                    self.skip_offsets[idx] = True
+
+        # The number of properties is equal to the number of non-skipped columns.
+        self.prop_count = self.skip_offsets.count(False)
+        self.packed_header = self.pack_header()
+        self.binary_size += len(self.packed_header)
+
     # Convert a list of properties into a binary string
     def pack_props(self, line):
         props = []
-        for num, field in enumerate(line[self.prop_offset:]):
-            field_type_idx = self.prop_offset+num
-            try:
-                module_vars.FIELD_TYPES[self.entity_str][field_type_idx]
-            except:
-                props.append(prop_to_binary(field, None))
-            else:
-                props.append(prop_to_binary(field, module_vars.FIELD_TYPES[self.entity_str][field_type_idx]))
+        for idx, field in enumerate(line):
+            if self.skip_offsets[idx]:
+                continue
+            if self.column_names[idx]:
+                props.append(prop_to_binary(field, self.types[idx]))
         return b''.join(p for p in props)
 
     def to_binary(self):
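
As a standalone illustration of the header rules introduced in convert_header() (a sketch, not from the commit; the concrete type tokens such as 'id', 'string', 'double', and 'ignore' are assumptions about what schema.convert_schema_type accepts): a field with no colon is read as a bare type and is never stored as a property, while name:type pairs become properties unless the type is START_ID, END_ID, or IGNORE.

    # Standalone sketch of the classification performed by convert_header().
    # The type spellings below are assumed; only the split/skip logic mirrors the diff.
    header = ['id', 'name:string', 'age:double', 'notes:ignore']

    column_names = []
    skip_offsets = []
    for field in header:
        pair = field.split(':')
        if len(pair) < 2:
            # Bare type such as 'id': used for its role, never packed as a property.
            column_names.append(None)
            skip_offsets.append(True)
        else:
            # 'name:type' pair: the left side becomes the property key.
            column_names.append(pair[0])
            skip_offsets.append(pair[1] in ('start_id', 'end_id', 'ignore'))

    print(column_names)              # [None, 'name', 'age', 'notes']
    print(skip_offsets)              # [True, False, False, True]
    print(skip_offsets.count(False)) # 2 properties ('name' and 'age') will be packed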

bulk_insert/label.py

Lines changed: 11 additions & 32 deletions
@@ -4,46 +4,24 @@
 from configs import Configs
 from exceptions import SchemaError
 import module_vars
-import schema
+from schema import Type
+from schema import convert_schema_type
 
 
 # Handler class for processing label csv files.
 class Label(EntityFile):
-    def __init__(self, infile, separator):
-        super(Label, self).__init__(infile, separator)
-        expected_col_count = self.process_header()
-        self.process_entities(expected_col_count)
-        self.infile.close()
-
-    def process_header_schema(self, header):
-        prop_count = len(header)
-        self.types = [None] * prop_count
-        for i, prop in enumerate(header):
-            pair = prop.split(':')
-            if len(pair) != 2:
-                raise SchemaError("Each header entry should be a colon-separated pair")
-            self.types[i] = schema.convert_schema_type(pair[1].casefold())
-
-    def process_header(self):
-        # Header format:
-        # node identifier (which may be a property key), then all other property keys
-        header = next(self.reader)
-        expected_col_count = len(header)
+    def __init__(self, infile):
+        super(Label, self).__init__(infile)
+        # Verify that exactly one field is labeled ID.
+        if self.types.count(Type.ID) != 1:
+            raise SchemaError("Node file '%s' should have exactly one ID column."
+                              % (infile.name))
 
-        if module_vars.CONFIGS.enforce_schema:
-            self.process_header_schema(header)
-        # If identifier field begins with an underscore, don't add it as a property.
-        if header[0][0] == '_':
-            self.prop_offset = 1
-        self.packed_header = self.pack_header(header)
-        self.binary_size += len(self.packed_header)
-        return expected_col_count
-
-    def process_entities(self, expected_col_count):
+    def process_entities(self):
         entities_created = 0
         with click.progressbar(self.reader, length=self.entities_count, label=self.entity_str) as reader:
             for row in reader:
-                self.validate_row(expected_col_count, row)
+                self.validate_row(row)
                 # Add identifier->ID pair to dictionary if we are building relations
                 if module_vars.NODE_DICT is not None:
                     if row[0] in module_vars.NODE_DICT:
@@ -69,4 +47,5 @@ def process_entities(self, expected_col_count):
                 self.binary_size += row_binary_len
                 self.binary_entities.append(row_binary)
         module_vars.QUERY_BUF.labels.append(self.to_binary())
+        self.infile.close()
        print("%d nodes created with label '%s'" % (entities_created, self.entity_str))

bulk_insert/module_vars.py

Lines changed: 0 additions & 1 deletion
@@ -6,4 +6,3 @@
 TOP_NODE_ID = 0 # next ID to assign to a node
 QUERY_BUF = None # Buffer for query being constructed
 QUOTING = None
-FIELD_TYPES = None

0 commit comments
