|
1 | 1 | import os
|
2 | 2 | import io
|
3 | 3 | import csv
|
| 4 | +import math |
4 | 5 | import struct
|
5 | 6 | import module_vars
|
6 |
| -from exceptions import CSVError |
7 |
| -import schema |
| 7 | +import configs |
| 8 | +from exceptions import CSVError, SchemaError |
| 9 | +from schema import Type, convert_schema_type |
8 | 10 |
|
9 | 11 |
|
10 | 12 | # Convert a single CSV property field into a binary stream.
|
11 | 13 | # Supported property types are string, numeric, boolean, and NULL.
|
12 | 14 | # type is either Type.DOUBLE, Type.BOOL or Type.STRING, and explicitly sets the value to this type if possible
|
13 |
def prop_to_binary(prop_val, prop_type):
    """Convert a single CSV property field into a binary stream.

    Supported property types are string, numeric, boolean, and NULL.
    prop_type is one of the schema Type members (e.g. Type.DOUBLE, Type.BOOL,
    Type.STRING) and, when given, forces the value to that type if possible;
    when None, the types are tried in order double -> bool -> string.

    Raises SchemaError when the value cannot be represented as the
    requested (or any) type.
    """
    # All format strings start with an unsigned char to represent the Type enum.
    format_str = "=B"
    if prop_val is None:
        # An empty field indicates a NULL property.
        return struct.pack(format_str, Type.NULL)

    if prop_type is None or prop_type == Type.DOUBLE:
        # If the field can be cast to a finite float, emit it as a double.
        try:
            numeric_prop = float(prop_val)
            if not math.isnan(numeric_prop) and not math.isinf(numeric_prop):  # Don't accept non-finite values.
                return struct.pack(format_str + "d", Type.DOUBLE, numeric_prop)
        except (ValueError, TypeError):
            # Only an error when a double was explicitly requested; with no
            # declared type we fall through to the bool/string parsers below.
            if prop_type == Type.DOUBLE:
                raise SchemaError("Could not parse '%s' as a double" % prop_val)

    if prop_type is None or prop_type == Type.BOOL:
        # If the field is 'false' or 'true' (any case), it is a boolean.
        if prop_val.lower() == 'false':
            return struct.pack(format_str + '?', Type.BOOL, False)
        elif prop_val.lower() == 'true':
            return struct.pack(format_str + '?', Type.BOOL, True)

    # TODO tmp: LABEL/TYPE/ID are treated as strings for testing.
    if prop_type is None or prop_type in (Type.STRING, Type.LABEL, Type.TYPE, Type.ID):
        # If we've reached this point, the property is a string.
        encoded_str = str.encode(prop_val)  # struct.pack requires bytes objects as arguments
        # Encoding len+1 adds a null terminator to the string.
        format_str += "%ds" % (len(encoded_str) + 1)
        return struct.pack(format_str, Type.STRING, encoded_str)

    # If it hasn't returned by this point, the value cannot adopt the requested type.
    raise SchemaError("unable to parse [" + prop_val + "] with type [" + repr(prop_type) + "]")
|
45 | 55 |
|
46 | 56 |
|
47 | 57 | # Superclass for label and relation CSV files
|
48 | 58 | class EntityFile(object):
|
49 |
    def __init__(self, filename):
        """Open the CSV file and prepare the header, counts, and reader.

        NOTE: the call order at the end of this method is load-bearing:
        convert_header() consumes the header row via the csv reader,
        count_entities() then counts the REMAINING (data) rows and rewinds
        the file to offset 0, and the final next(self.reader) re-skips the
        header so subsequent reads start at the first data row.
        """
        # The label or relation type string is the basename of the file
        self.entity_str = os.path.splitext(os.path.basename(filename))[0]
        # Input file handling
        self.infile = io.open(filename, 'rt')
        # Initialize CSV reader that ignores leading whitespace in each field
        # and does not modify input quote characters
        self.reader = csv.reader(self.infile, delimiter=module_vars.CONFIGS.separator, skipinitialspace=True, quoting=module_vars.QUOTING)

        self.packed_header = b''       # binary header, filled in by convert_header()
        self.binary_entities = []      # packed rows awaiting transmission
        self.binary_size = 0 # size of binary token

        # Extract data from header row (consumes the header from the reader).
        self.convert_header()

        self.count_entities() # number of entities/row in file; also rewinds the file.
        next(self.reader) # Skip header for next read.
65 | 77 |
|
66 | 78 | # Count number of rows in file.
|
67 | 79 | def count_entities(self):
|
68 | 80 | self.entities_count = 0
|
69 | 81 | self.entities_count = sum(1 for line in self.infile)
|
70 |
| - # discard header row |
71 |
| - self.entities_count -= 1 |
72 | 82 | # seek back
|
73 | 83 | self.infile.seek(0)
|
74 | 84 | return self.entities_count
|
75 | 85 |
|
76 | 86 | # Simple input validations for each row of a CSV file
|
77 |
| - def validate_row(self, expected_col_count, row): |
| 87 | + def validate_row(self, row): |
78 | 88 | # Each row should have the same number of fields
|
79 |
| - if len(row) != expected_col_count: |
| 89 | + if len(row) != self.column_count: |
80 | 90 | raise CSVError("%s:%d Expected %d columns, encountered %d ('%s')"
|
81 |
| - % (self.infile.name, self.reader.line_num, expected_col_count, len(row), ','.join(row))) |
| 91 | + % (self.infile.name, self.reader.line_num, self.column_count, len(row), configs.separator.join(row))) |
82 | 92 |
|
83 | 93 | # If part of a CSV file was sent to Redis, delete the processed entities and update the binary size
|
84 | 94 | def reset_partial_binary(self):
|
85 | 95 | self.binary_entities = []
|
86 | 96 | self.binary_size = len(self.packed_header)
|
87 | 97 |
|
88 | 98 | # Convert property keys from a CSV file header into a binary string
|
89 |
| - def pack_header(self, header): |
90 |
| - prop_count = len(header) - self.prop_offset |
| 99 | + def pack_header(self): |
91 | 100 | # String format
|
92 | 101 | entity_bytes = self.entity_str.encode()
|
93 | 102 | fmt = "=%dsI" % (len(entity_bytes) + 1) # Unaligned native, entity name, count of properties
|
94 |
| - args = [entity_bytes, prop_count] |
95 |
| - for p in header[self.prop_offset:]: |
96 |
| - prop = p.encode() |
| 103 | + args = [entity_bytes, self.prop_count] |
| 104 | + for idx in range(self.column_count): |
| 105 | + if self.skip_offsets[idx]: |
| 106 | + continue |
| 107 | + prop = self.column_names[idx].encode() |
97 | 108 | fmt += "%ds" % (len(prop) + 1) # encode string with a null terminator
|
98 | 109 | args.append(prop)
|
99 | 110 | return struct.pack(fmt, *args)
|
100 | 111 |
|
| 112 | + # Extract column names and types from a header row |
| 113 | + def convert_header(self): |
| 114 | + header = next(self.reader) |
| 115 | + self.column_count = len(header) |
| 116 | + self.column_names = [None] * self.column_count # Property names of every column. |
| 117 | + self.types = [None] * self.column_count # Value type of every column. |
| 118 | + self.skip_offsets = [False] * self.column_count # Whether column at any offset should not be stored as a property. |
| 119 | + |
| 120 | + for idx, field in enumerate(header): |
| 121 | + pair = field.split(':') |
| 122 | + if len(pair) > 2: |
| 123 | + raise CSVError("Field '%s' had %d colons" % field, len(field)) |
| 124 | + elif len(pair) < 2: |
| 125 | + self.types[idx] = convert_schema_type(pair[0].casefold()) |
| 126 | + self.skip_offsets[idx] = True |
| 127 | + if self.types[idx] not in (Type.ID, Type.START_ID, Type.END_ID, Type.IGNORE): |
| 128 | + # Any other field should have 2 elements |
| 129 | + raise SchemaError("Each property in the header should be a colon-separated pair") |
| 130 | + else: |
| 131 | + self.column_names[idx] = pair[0] |
| 132 | + self.types[idx] = convert_schema_type(pair[1].casefold()) |
| 133 | + if self.types[idx] in (Type.START_ID, Type.END_ID, Type.IGNORE): |
| 134 | + self.skip_offsets[idx] = True |
| 135 | + |
| 136 | + # The number of properties is equal to the number of non-skipped columns. |
| 137 | + self.prop_count = self.skip_offsets.count(False) |
| 138 | + self.packed_header = self.pack_header() |
| 139 | + self.binary_size += len(self.packed_header) |
| 140 | + |
101 | 141 | # Convert a list of properties into a binary string
|
102 | 142 | def pack_props(self, line):
|
103 | 143 | props = []
|
104 |
| - for num, field in enumerate(line[self.prop_offset:]): |
105 |
| - field_type_idx = self.prop_offset+num |
106 |
| - try: |
107 |
| - module_vars.FIELD_TYPES[self.entity_str][field_type_idx] |
108 |
| - except: |
109 |
| - props.append(prop_to_binary(field, None)) |
110 |
| - else: |
111 |
| - props.append(prop_to_binary(field, module_vars.FIELD_TYPES[self.entity_str][field_type_idx])) |
| 144 | + for idx, field in enumerate(line): |
| 145 | + if self.skip_offsets[idx]: |
| 146 | + continue |
| 147 | + if self.column_names[idx]: |
| 148 | + props.append(prop_to_binary(field, self.types[idx])) |
112 | 149 | return b''.join(p for p in props)
|
113 | 150 |
|
114 | 151 | def to_binary(self):
|
|
0 commit comments