Commit 8a06c974 authored by Swaroop Vattam

synced scripts

parent 41dcd676
Pipeline #31 passed in 96 minutes and 18 seconds
.gitlab-ci.yml

variables:
  DATA_SUPPLY_COMMIT: ed9c9ebb878f1a3cf7508e8761192c9e8084e25b

# Run full validation of both the repository and the datasets, but only in the repository itself.
test:
  stage: build
@@ -5,13 +9,16 @@ test:
  variables:
    GIT_STRATEGY: clone
    GIT_SUBMODULE_STRATEGY: recursive
    # We intentionally do not fetch submodules so that we validate just the current repository.
    # This assumes submodules are validated in their own repositories.
    GIT_SUBMODULE_STRATEGY: none
  before_script:
    - "[ ! -f $(git rev-parse --git-dir)/shallow ] || ( echo 'Repository is shallow.' && exit 1 )"
    - git lfs fetch --all
    - pip3 install cerberus==1.3.1 deep_dircmp==0.1.0
    - git clone --recursive https://gitlab.com/datadrivendiscovery/data-supply.git
    - git -C data-supply checkout df915cf20a44f948c8ee2aeb3a15e11d130286d9
    - git -C data-supply checkout "${DATA_SUPPLY_COMMIT}"
  script:
    - |
@@ -41,3 +48,32 @@ test:
        fi
      fi
    - echo "SUCCESS"
# Run just the dataset validator, but also on git submodules.
test_recursive:
  stage: build
  image: registry.gitlab.com/datadrivendiscovery/images/core:ubuntu-bionic-python36-devel
  variables:
    GIT_STRATEGY: clone
    GIT_SUBMODULE_STRATEGY: recursive
  before_script:
    - "[ ! -f $(git rev-parse --git-dir)/shallow ] || ( echo 'Repository is shallow.' && exit 1 )"
    - git lfs fetch --all
    - git submodule foreach --recursive "git lfs fetch --all"
    - pip3 install cerberus==1.3.1 deep_dircmp==0.1.0
    - git clone --recursive https://gitlab.com/datadrivendiscovery/data-supply.git
    - git -C data-supply checkout "${DATA_SUPPLY_COMMIT}"
  script:
    - |
      set -o errexit
      echo "Validating datasets."
      ./validate.py
    - echo "SUCCESS"
  only:
    refs:
      - master
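Both jobs above pin data-supply to the same revision through the DATA_SUPPLY_COMMIT variable and then run ./validate.py against the checked-out data. As a hedged sketch (not part of the repository), the CI steps could be reproduced locally roughly like this, assuming git, git-lfs, and pip3 are available and the script is run from the repository root:

# Rough local mirror of the CI before_script/script above; illustrative only.
import subprocess

DATA_SUPPLY_COMMIT = "ed9c9ebb878f1a3cf7508e8761192c9e8084e25b"  # same value as in the CI variables

commands = [
    ["git", "lfs", "fetch", "--all"],
    ["pip3", "install", "cerberus==1.3.1", "deep_dircmp==0.1.0"],
    ["git", "clone", "--recursive", "https://gitlab.com/datadrivendiscovery/data-supply.git"],
    ["git", "-C", "data-supply", "checkout", DATA_SUPPLY_COMMIT],
    ["./validate.py"],
]

for command in commands:
    # check=True aborts on the first failing step, like `set -o errexit` in the CI script.
    subprocess.run(command, check=True)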
validate.py

@@ -58,6 +58,7 @@
# - When there is a "multiIndex" column, all rows for the same index value should have the same
#   values in all columns except "suggestedTarget" columns.
# - Makes sure that "columnsCount" matches the number of columns, when it exists.
# - For many fields, it makes sure that only standard values are used.
import argparse
import collections
@@ -106,6 +107,9 @@ with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'documentation'
        else:
            res_format_to_extensions[res_format] = sorted(set(extensions) | set(res_format_to_extensions[res_format]))


with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'documentation', 'standardValues.json')) as standard_values_file:
    standard_values = json.load(standard_values_file)


@functools.lru_cache(maxsize=10)
def read_csv(data_path):
@@ -121,6 +125,64 @@ def read_csv(data_path):
    )


def validate_standard_values(document):
    for key, value in document.items():
        if key in standard_values and isinstance(value, (str, list)):
            if isinstance(value, str):
                value = [value]

            for v in value:
                if v not in standard_values[key]:
                    print("ERROR: Field '{key}' has a non-standard value '{v}'.".format(
                        key=key,
                        v=v,
                    ))
                    return True

        elif isinstance(value, dict):
            if key == 'resObject':
                value_keys = set(value.keys())

                if len(value_keys) != 1:
                    print("ERROR: Field '{key}' is a dict with more than one key.".format(
                        key=key,
                    ))
                    return True

                extra_keys = value_keys - {'nodeAttribute', 'edgeAttribute', 'columnIndex', 'columnName'}
                if extra_keys:
                    print("ERROR: Field '{key}' is a dict with a non-standard key '{extra_key}'.".format(
                        key=key,
                        extra_key=extra_keys.pop(),
                    ))
                    return True

            elif key == 'resComponent':
                value_keys = set(value.keys())

                if len(value_keys) != 1:
                    print("ERROR: Field '{key}' is a dict with more than one key.".format(
                        key=key,
                    ))
                    return True

                extra_keys = value_keys - {'columnIndex', 'columnName', 'nodeAttribute', 'edgeAttribute', 'selector'}
                if extra_keys:
                    print("ERROR: Field '{key}' is a dict with a non-standard key '{extra_key}'.".format(
                        key=key,
                        extra_key=extra_keys.pop(),
                    ))
                    return True

            elif validate_standard_values(value):
                return True

        elif isinstance(value, list):
            for v in value:
                if isinstance(v, dict) and validate_standard_values(v):
                    return True

    return False
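To show what the check above is meant to catch, here is a self-contained, hedged sketch; the standard_values mapping and the helper name has_non_standard_value are made up for illustration (the real mapping is loaded from data-supply's standardValues.json), and this is not the repository's code:

# Illustrative sketch only: a trimmed-down version of the flat-value check,
# using a hypothetical standardValues.json entry.
standard_values = {"resType": ["table", "graph", "image"]}  # hypothetical entry

def has_non_standard_value(document):
    for key, value in document.items():
        if key in standard_values and isinstance(value, (str, list)):
            values = [value] if isinstance(value, str) else value
            for v in values:
                if v not in standard_values[key]:
                    print("ERROR: Field '{key}' has a non-standard value '{v}'.".format(key=key, v=v))
                    return True
        elif isinstance(value, dict) and has_non_standard_value(value):
            return True
    return False

# A value outside the allowed list is reported, and the first error short-circuits validation.
print(has_non_standard_value({"dataResources": {"resType": "hologram"}}))  # prints the ERROR line, then True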
def validate_dataset_path(description_id, description_path, *, strict_naming=True):
    if os.path.basename(description_path) != 'datasetDoc.json':
        print("ERROR: Dataset description filename is not 'datasetDoc.json'.")
@@ -766,6 +828,9 @@ def validate_dataset_description(dataset_description_path, known_dataset_descrip
        if validate_dataset(dataset_description_path, dataset_description):
            return True

        if validate_standard_values(dataset_description):
            return True
    except Exception:
        print("ERROR: Unexpected exception:")
        traceback.print_exc()
@@ -832,6 +897,9 @@ def validate_problem_description(problem_description_path, known_problem_descrip
        ))
        return True

    if validate_standard_values(problem_description):
        return True

    if validate_metrics(problem_description):
        return True