Commit 358899e1 authored by Swaroop Vattam
Browse files

updated the housekeeping files

parent 89854d58
DATA_SUPPLY_COMMIT: ed9c9ebb878f1a3cf7508e8761192c9e8084e25b
DATA_SUPPLY_COMMIT: 5b3ec32976bd12067c2841a5fd08de93fd3e3e9c
# Run full validation of both the repository and datasets, but just in the repository itself.
......@@ -55,8 +55,6 @@
# same resource, only one each. It should have two additional column roles for direction
# and simple/multi. Those should match between columns (so both should be directed or not,
# and simple or multi, but not mixed).
# - When there is a "multiIndex" column, all rows for the same index value should have the same
# values in all columns except the "suggestedTarget" columns.
# - Makes sure that "columnsCount" matches the number of columns, when it exists.
# - For many fields it makes sure that only standard values for them are used.
......@@ -432,40 +430,6 @@ def validate_collection(dataset_description_path, data_resource):
return error
def validate_multi_index(dataset_description_path, data_resource, multi_index_column):
error = False
suggested_target_columns = []
for column_description in data_resource['columns']:
if 'suggestedTarget' in column_description['role']:
data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
data = read_csv(data_path)
attribute_columns = [column_index for column_index in range(len(data.columns)) if column_index != multi_index_column and column_index not in suggested_target_columns]
attributes = data.iloc[:, attribute_columns].set_index(data.iloc[:, multi_index_column])
count = 0
for group_name, group in attributes.groupby(level=0):
# The first row in a group is not marked, so we add 1 to number of duplicated rows.
if group.duplicated(keep='first').sum() + 1 != len(group):
count += 1
print("ERROR: Dataset '{dataset_path}' has a multi-index resource '{resource_id}' with all attributes in rows not equal for index value '{value}'.".format(
error = True
if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
return error
def validate_edgelist(dataset_description_path, data_resource):
error = False
......@@ -748,9 +712,6 @@ def validate_dataset(dataset_description_path, dataset_description):
if edgelist_columns:
error = validate_edgelist(dataset_description_path, data_resource) or error
if len(multi_index_columns) == 1:
error = validate_multi_index(dataset_description_path, data_resource, multi_index_columns[0]) or error
for res_format in data_resource['resFormat'].keys():
if res_format not in res_format_to_extensions:
print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with unsupported format: {res_format}".format(
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment