Commit 5fadb311 authored by Swaroop Vattam's avatar Swaroop Vattam
Browse files

updated validate.py script

parent 10188a0b
Pipeline #40 passed with stage
in 147 minutes and 50 seconds
......@@ -745,11 +745,11 @@ def validate_dataset(dataset_description_path, dataset_description):
return error
def validate_dataset_description(dataset_description_path, known_dataset_descriptions, *, strict_naming=True):
def validate_dataset_description(dataset_description_path, known_dataset_descriptions, *, strict_naming=True, allow_duplicate=False):
print("Validating dataset '{dataset_description_path}'.".format(dataset_description_path=dataset_description_path))
try:
with open(dataset_description_path) as dataset_description_file:
with open(dataset_description_path, 'r', encoding='utf8') as dataset_description_file:
dataset_description = json.load(dataset_description_file)
if not dataset_description_validator.validate(dataset_description):
......@@ -767,17 +767,18 @@ def validate_dataset_description(dataset_description_path, known_dataset_descrip
dataset_id = dataset_id[:-5] + '_SCORE'
if dataset_id in known_dataset_descriptions:
print("ERROR: Duplicate dataset ID '{dataset_id}': '{first_path}' and '{second_path}'".format(
dataset_id=dataset_id,
first_path=known_dataset_descriptions[dataset_id]['path'],
second_path=dataset_description_path,
))
return True
known_dataset_descriptions[dataset_id] = {
'path': dataset_description_path,
'description': dataset_description,
}
if not allow_duplicate:
print("ERROR: Duplicate dataset ID '{dataset_id}': '{first_path}' and '{second_path}'".format(
dataset_id=dataset_id,
first_path=known_dataset_descriptions[dataset_id]['path'],
second_path=dataset_description_path,
))
return True
else:
known_dataset_descriptions[dataset_id] = {
'path': dataset_description_path,
'description': dataset_description,
}
if validate_dataset_path(dataset_id, dataset_description_path, strict_naming=strict_naming):
return True
......@@ -804,7 +805,7 @@ def validate_problem_description(problem_description_path, known_problem_descrip
print("Validating problem '{problem_description_path}'.".format(problem_description_path=problem_description_path))
try:
with open(problem_description_path) as problem_description_file:
with open(problem_description_path, 'r', encoding='utf8') as problem_description_file:
problem_description = json.load(problem_description_file)
if not problem_description_validator.validate(problem_description):
......@@ -826,7 +827,7 @@ def validate_problem_description(problem_description_path, known_problem_descrip
if data['datasetID'].endswith('_TEST'):
data['datasetID'] = data['datasetID'][:-5] + '_SCORE'
# All problem descriptions show be the same.
# All problem descriptions should be the same.
if problem_id.endswith('_TRAIN') or problem_id.endswith('_TEST') or problem_id.endswith('_SCORE'):
print("ERROR: Invalid problem ID '{problem_id}' in '{problem_description_path}'.".format(
problem_id=problem_id,
......@@ -835,8 +836,8 @@ def validate_problem_description(problem_description_path, known_problem_descrip
return True
if problem_id in known_problem_descriptions:
# Problem descriptions with same ID should have the same content.
if problem_description == known_problem_descriptions[problem_id]['description']:
# Problem descriptions with same ID should have the same canonical content.
if canonical_problem_description(problem_description) == canonical_problem_description(known_problem_descriptions[problem_id]['description']):
known_problem_descriptions[problem_id]['paths'].append(problem_description_path)
else:
print("ERROR: Duplicate problem ID '{problem_id}', but different problem description: {first_paths} and '{second_path}'".format(
......@@ -922,7 +923,7 @@ def validate_column_values(dataset_description_path, data_resource, column_index
return error
def validate_target_values(problem_paths, dataset_path, problem_description, data_resource, target):
def validate_target_values(problem_path, dataset_path, problem_description, data_resource, target):
error = False
data_path = os.path.join(os.path.dirname(dataset_path), data_resource['resPath'])
......@@ -941,15 +942,15 @@ def validate_target_values(problem_paths, dataset_path, problem_description, dat
if 'binary' in task_keywords:
if number_distinct_values != 2:
print("ERROR: Problem {problem_paths} has 'binary' keyword, but target column does not have 2 distinct values, but {number_distinct_values}.".format(
problem_paths=problem_paths,
print("ERROR: Problem {problem_path} has 'binary' keyword, but target column does not have 2 distinct values, but {number_distinct_values}.".format(
problem_path=problem_path,
number_distinct_values=number_distinct_values,
))
error = True
elif 'multiClass' in task_keywords:
if number_distinct_values < 3:
print("ERROR: Problem {problem_paths} has 'multiClass' keyword, but target column does not have more than 2 distinct values, but {number_distinct_values}.".format(
problem_paths=problem_paths,
print("ERROR: Problem {problem_path} has 'multiClass' keyword, but target column does not have more than 2 distinct values, but {number_distinct_values}.".format(
problem_path=problem_path,
number_distinct_values=number_distinct_values,
))
error = True
......@@ -957,15 +958,15 @@ def validate_target_values(problem_paths, dataset_path, problem_description, dat
for metric in problem_description.get('inputs', {}).get('performanceMetrics', []):
if metric['metric'] in ['f1', 'precision', 'recall', 'jaccardSimilarityScore']:
if number_distinct_values != 2:
print("ERROR: Problem {problem_paths} uses '{metric}' metric, but target column does not have 2 distinct values, but {number_distinct_values}.".format(
problem_paths=problem_paths,
print("ERROR: Problem {problem_path} uses '{metric}' metric, but target column does not have 2 distinct values, but {number_distinct_values}.".format(
problem_path=problem_path,
metric=metric['metric'],
number_distinct_values=number_distinct_values,
))
error = True
if 'posLabel' in metric and metric['posLabel'] not in distinct_values:
print("ERROR: Problem {problem_paths} provides 'posLabel' for metric '{metric}' with value '{value}', but possible values are: {distinct_values}".format(
problem_paths=problem_paths,
print("ERROR: Problem {problem_path} provides 'posLabel' for metric '{metric}' with value '{value}', but possible values are: {distinct_values}".format(
problem_path=problem_path,
metric=metric['metric'],
value=metric['posLabel'],
distinct_values=sorted(distinct_values),
......@@ -973,13 +974,13 @@ def validate_target_values(problem_paths, dataset_path, problem_description, dat
error = True
if has_missing_values and not task_keywords & {'semiSupervised', 'clustering'}:
print("ERROR: Problem {problem_paths} has target column with missing values, but it not a semi-supervised or clustering task.".format(
problem_paths=problem_paths,
print("ERROR: Problem {problem_path} has target column with missing values, but it not a semi-supervised or clustering task.".format(
problem_path=problem_path,
))
error = True
if 'semiSupervised' in task_keywords and not has_missing_values:
print("ERROR: Problem {problem_paths} is a semi-supervised task, but does not have a target column with missing values.".format(
problem_paths=problem_paths,
print("ERROR: Problem {problem_path} is a semi-supervised task, but does not have a target column with missing values.".format(
problem_path=problem_path,
))
error = True
......@@ -1038,7 +1039,7 @@ def get_all_columns(dataset_path, resource_id, data_resource):
return data_columns
def validate_target(problem_paths, dataset_path, problem_description, dataset_description, target, check_target_values):
def validate_target(problem_path, dataset_path, problem_description, dataset_description, target, check_target_values):
error = False
try:
......@@ -1048,8 +1049,8 @@ def validate_target(problem_paths, dataset_path, problem_description, dataset_de
for column in columns:
if target['colName'] == column['colName'] or target['colIndex'] == column['colIndex']:
if not (target['colName'] == column['colName'] and target['colIndex'] == column['colIndex']):
print("ERROR: Problem {problem_paths} has a target '{target_index}' which does not match a column '{column_index}' in dataset '{dataset_path}' fully.".format(
problem_paths=problem_paths,
print("ERROR: Problem {problem_path} has a target '{target_index}' which does not match a column '{column_index}' in dataset '{dataset_path}' fully.".format(
problem_path=problem_path,
target_index=target['targetIndex'],
column_index=column['colIndex'],
dataset_path=dataset_path,
......@@ -1057,7 +1058,7 @@ def validate_target(problem_paths, dataset_path, problem_description, dataset_de
error = True
if check_target_values:
error = validate_target_values(problem_paths, dataset_path, problem_description, data_resource, target) or error
error = validate_target_values(problem_path, dataset_path, problem_description, data_resource, target) or error
break
else:
......@@ -1073,8 +1074,8 @@ def validate_target(problem_paths, dataset_path, problem_description, dataset_de
))
except (IndexError, KeyError):
print("ERROR: Problem {problem_paths} has target with index '{target_index}' which does not resolve.".format(
problem_paths=problem_paths,
print("ERROR: Problem {problem_path} has target with index '{target_index}' which does not resolve.".format(
problem_path=problem_path,
target_index=target['targetIndex'],
))
return True
......@@ -1098,6 +1099,15 @@ def canonical_dataset_description(dataset_description):
return dataset_description
def canonical_problem_description(problem_description):
    """Return a canonical copy of a problem description for equality comparison.

    Problem descriptions for the TRAIN/TEST/SCORE copies of the same problem
    may legitimately differ in their ``dataSplits`` section, so it is removed
    before descriptions are compared for being "the same" problem.

    The input is deep-copied and never mutated.

    Parameters
    ----------
    problem_description : dict
        A parsed ``problemDoc.json`` problem description.

    Returns
    -------
    dict
        A deep copy of ``problem_description`` without ``inputs.dataSplits``.
    """

    problem_description = copy.deepcopy(problem_description)

    # Use .get() so a description without an 'inputs' section does not raise
    # KeyError — the rest of this script reads 'inputs' the same defensive way.
    if 'dataSplits' in problem_description.get('inputs', {}):
        del problem_description['inputs']['dataSplits']

    return problem_description
def datasets_equal(first_dataset_path, second_dataset_path):
if first_dataset_path == second_dataset_path:
return True
......@@ -1124,12 +1134,12 @@ def datasets_equal(first_dataset_path, second_dataset_path):
return True
def validate_dataset_reference(dataset_id, dataset_descriptions, targets, problem_description_value, check_target_values):
def validate_dataset_reference(dataset_id, dataset_descriptions, targets, problem_description, problem_description_path, check_target_values):
error = False
if dataset_id not in dataset_descriptions:
print("ERROR: Problem {problem_paths} is referencing unknown dataset '{dataset_id}'.".format(
problem_paths=problem_description_value['paths'],
print("ERROR: Problem {problem_path} is referencing unknown dataset '{dataset_id}'.".format(
problem_path=problem_description_path,
dataset_id=dataset_id,
))
error = True
......@@ -1138,12 +1148,12 @@ def validate_dataset_reference(dataset_id, dataset_descriptions, targets, proble
dataset_description = dataset_description_value['description']
for i, target in enumerate(targets):
if target['targetIndex'] != i:
print("ERROR: Problem {problem_paths} has target with invalid target index '{target_index}'.".format(
problem_paths=problem_description_value['paths'],
print("ERROR: Problem {problem_path} has target with invalid target index '{target_index}'.".format(
problem_path=problem_description_path,
target_index=target['targetIndex'],
))
error = True
error = validate_target(problem_description_value['paths'], dataset_description_value['path'], problem_description_value['description'], dataset_description, target, check_target_values) or error
error = validate_target(problem_description_path, dataset_description_value['path'], problem_description, dataset_description, target, check_target_values) or error
return error
......@@ -1156,6 +1166,31 @@ def map_dataset_id(dataset_id, dataset_view_map):
raise KeyError("Could not map '{dataset_id}' in dataset view map.".format(dataset_id=dataset_id))
def validate_dataset_view_maps(dataset_descriptions, problem_description_paths, datasetId, targets):
    """Validate the dataset view maps of every copy of a problem description.

    Problem descriptions which are otherwise the same when compared through
    their canonical versions can still differ in dataset view maps, so every
    path listed in ``problem_description_paths`` is re-read from disk and its
    view maps (when present) are validated individually.

    Parameters
    ----------
    dataset_descriptions : dict
        Known dataset descriptions, keyed by dataset ID.
    problem_description_paths : list
        Paths of all problem description files sharing one problem ID.
    datasetId : str
        The dataset ID referenced by the problem description.
    targets : list
        The problem's target descriptions.

    Returns
    -------
    bool
        ``True`` when any validation error was found, ``False`` otherwise.
    """

    error = False

    for problem_description_path in problem_description_paths:
        with open(problem_description_path, 'r', encoding='utf8') as problem_description_file:
            problem_description = json.load(problem_description_file)

        data_splits = problem_description.get('inputs', {}).get('dataSplits', {})
        if 'datasetViewMaps' not in data_splits:
            continue

        view_maps = data_splits['datasetViewMaps']
        if set(view_maps.keys()) != {'train', 'test', 'score'}:
            print("ERROR: Problem {problem_path} has dataset view maps with invalid keys.".format(
                problem_path=problem_description_path,
            ))
            error = True
            continue

        # Only the train split contains all target values, so test and score
        # splits are validated without checking target values.
        for split_name, check_target_values in (('train', True), ('test', False), ('score', False)):
            mapped_dataset_id = map_dataset_id(datasetId, view_maps[split_name])
            error = validate_dataset_reference(mapped_dataset_id, dataset_descriptions, targets, problem_description, problem_description_path, check_target_values) or error

    return error
def validate(dataset_descriptions, problem_descriptions):
print("Validating all datasets and problems.")
......@@ -1166,20 +1201,8 @@ def validate(dataset_descriptions, problem_descriptions):
for problem_description_value in problem_descriptions.values():
problem_description = problem_description_value['description']
for data in problem_description.get('inputs', {}).get('data', []):
error = validate_dataset_reference(data['datasetID'], dataset_descriptions, data.get('targets', []), problem_description_value, True) or error
if 'datasetViewMaps' in problem_description.get('inputs', {}).get('dataSplits', {}):
if {'train', 'test', 'score'} != set(problem_description['inputs']['dataSplits']['datasetViewMaps'].keys()):
print("ERROR: Problem {problem_paths} has dataset view maps with invalid keys.".format(
problem_paths=problem_description_value['paths'],
))
error = True
else:
error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['train']), dataset_descriptions, data.get('targets', []), problem_description_value, True) or error
# Test and score splits do not have all values, so we do not validate target values there.
error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['test']), dataset_descriptions, data.get('targets', []), problem_description_value, False) or error
error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['score']), dataset_descriptions, data.get('targets', []), problem_description_value, False) or error
error = validate_dataset_reference(data['datasetID'], dataset_descriptions, data.get('targets', []), problem_description, problem_description_value['paths'], True) or error
error = validate_dataset_view_maps(dataset_descriptions, problem_description_value['paths'], data['datasetID'], data.get('targets', [])) or error
if 'clustering' in problem_description['about']['taskKeywords']:
for data in problem_description.get('inputs', {}).get('data', []):
......@@ -1257,7 +1280,7 @@ def validate(dataset_descriptions, problem_descriptions):
return error
def search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, *, strict_naming=True):
def search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, *, strict_naming=True, allow_duplicate=False):
error = False
datasets_directory = os.path.abspath(datasets_directory)
......@@ -1270,7 +1293,7 @@ def search_directory(datasets_directory, known_dataset_descriptions, known_probl
dataset_description_path = os.path.join(dirpath, 'datasetDoc.json')
error = validate_dataset_description(dataset_description_path, known_dataset_descriptions, strict_naming=strict_naming) or error
error = validate_dataset_description(dataset_description_path, known_dataset_descriptions, strict_naming=strict_naming, allow_duplicate=allow_duplicate) or error
if 'problemDoc.json' in filenames:
# We continue traversing further in this case.
......@@ -1288,6 +1311,11 @@ def configure_parser(parser: argparse.ArgumentParser, *, skip_arguments=()):
'-n', '--no-strict-naming', default=True, action='store_false', dest='strict_naming',
help="do not require strict naming convention",
)
if 'allow_duplicate' not in skip_arguments:
parser.add_argument(
'-d', '--allow-duplicate', default=False, action='store_true', dest='allow_duplicate',
help="allow duplicate datasets",
)
if 'directories' not in skip_arguments:
parser.add_argument(
'directories', metavar='DIR', nargs='*', default=['.'],
......@@ -1302,7 +1330,7 @@ def handler(arguments):
known_problem_descriptions = {}
for datasets_directory in arguments.directories:
error = search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, strict_naming=arguments.strict_naming) or error
error = search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, strict_naming=arguments.strict_naming, allow_duplicate=arguments.allow_duplicate) or error
error = validate(known_dataset_descriptions, known_problem_descriptions) or error
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment