Commit 89854d58 authored by Swaroop Vattam's avatar Swaroop Vattam

Merge branch 'sync-datasets' into 'master'

Sync datasets

See merge request !2
parents 6420b4ce 8a06c974
Pipeline #32 passed in 112 minutes and 38 seconds
@@ -42840,3 +42840,23 @@ seed_datasets_current/32_fma_MIN_METADATA/TRAIN/dataset_TRAIN/media/127350.mp3 filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/32_fma_MIN_METADATA/TRAIN/dataset_TRAIN/media/127996.mp3 filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/32_fma_MIN_METADATA/TRAIN/dataset_TRAIN/media/123342.mp3 filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/32_fma_MIN_METADATA/TRAIN/dataset_TRAIN/media/126405.mp3 filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/1491_one_hundred_plants_margin_clust/SCORE/dataset_TEST/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/1491_one_hundred_plants_margin_clust/1491_one_hundred_plants_margin_clust_dataset/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/1491_one_hundred_plants_margin_clust/TRAIN/dataset_TRAIN/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/1491_one_hundred_plants_margin_clust/TEST/dataset_TEST/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/LL1_ElectricDevices_MIN_METADATA_problem/dataSplits.csv filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/LL1_ElectricDevices_MIN_METADATA_dataset/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/SCORE/dataset_SCORE/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/SCORE/problem_SCORE/dataSplits.csv filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/TRAIN/problem_TRAIN/dataSplits.csv filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/TRAIN/dataset_TRAIN/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/TEST/dataset_TEST/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/TEST/problem_TEST/dataSplits.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/LL1_ElectricDevices/SCORE/dataset_SCORE/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/LL1_ElectricDevices/SCORE/problem_SCORE/dataSplits.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/LL1_ElectricDevices/LL1_ElectricDevices_dataset/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/LL1_ElectricDevices/LL1_ElectricDevices_problem/dataSplits.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/LL1_ElectricDevices/TRAIN/problem_TRAIN/dataSplits.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/LL1_ElectricDevices/TRAIN/dataset_TRAIN/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/LL1_ElectricDevices/TEST/dataset_TEST/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
training_datasets/seed_datasets_archive/LL1_ElectricDevices/TEST/problem_TEST/dataSplits.csv filter=lfs diff=lfs merge=lfs -text
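All of the paths above are declared as Git LFS-tracked (`filter=lfs diff=lfs merge=lfs -text`), so a clone that has not run `git lfs fetch` contains only small pointer files for them. A minimal standalone sketch (not part of this repository) for spotting such pointers, relying on the documented LFS pointer header; the example path is just one of the files listed above:

# Sketch: detect whether a checked-out file is still a Git LFS pointer rather
# than the fetched content. LFS pointer files begin with the line
# "version https://git-lfs.github.com/spec/v1" (documented pointer format).
from pathlib import Path

LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"


def is_lfs_pointer(path: Path) -> bool:
    """Return True if the file starts with the Git LFS pointer header."""
    try:
        with path.open("rb") as f:
            return f.read(len(LFS_POINTER_PREFIX)) == LFS_POINTER_PREFIX
    except OSError:
        return False


if __name__ == "__main__":
    p = Path("seed_datasets_current/LL1_ElectricDevices_MIN_METADATA/"
             "LL1_ElectricDevices_MIN_METADATA_dataset/tables/learningData.csv")
    if p.exists():
        state = "LFS pointer (run 'git lfs fetch')" if is_lfs_pointer(p) else "real content"
        print(f"{p}: {state}")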
variables:
  DATA_SUPPLY_COMMIT: ed9c9ebb878f1a3cf7508e8761192c9e8084e25b

# Run full validation of both the repository and the datasets, but only on the repository itself.
test:
  stage: build
@@ -5,13 +9,16 @@ test:
  variables:
    GIT_STRATEGY: clone
    GIT_SUBMODULE_STRATEGY: recursive
    # We intentionally do not fetch submodules so that we validate only the current repository.
    # This assumes submodules are validated in their own repositories.
    GIT_SUBMODULE_STRATEGY: none
  before_script:
    - "[ ! -f $(git rev-parse --git-dir)/shallow ] || ( echo 'Repository is shallow.' && exit 1 )"
    - git lfs fetch --all
    - pip3 install cerberus==1.3.1 deep_dircmp==0.1.0
    - git clone --recursive https://gitlab.com/datadrivendiscovery/data-supply.git
    - git -C data-supply checkout df915cf20a44f948c8ee2aeb3a15e11d130286d9
    - git -C data-supply checkout "${DATA_SUPPLY_COMMIT}"
  script:
    - |
@@ -41,3 +48,32 @@ test:
        fi
      fi
    - echo "SUCCESS"
# Run only the dataset validator, but also on git submodules.
test_recursive:
  stage: build
  image: registry.gitlab.com/datadrivendiscovery/images/core:ubuntu-bionic-python36-devel
  variables:
    GIT_STRATEGY: clone
    GIT_SUBMODULE_STRATEGY: recursive
  before_script:
    - "[ ! -f $(git rev-parse --git-dir)/shallow ] || ( echo 'Repository is shallow.' && exit 1 )"
    - git lfs fetch --all
    - git submodule foreach --recursive "git lfs fetch --all"
    - pip3 install cerberus==1.3.1 deep_dircmp==0.1.0
    - git clone --recursive https://gitlab.com/datadrivendiscovery/data-supply.git
    - git -C data-supply checkout "${DATA_SUPPLY_COMMIT}"
  script:
    - |
      set -o errexit
      echo "Validating datasets."
      ./validate.py
    - echo "SUCCESS"
  only:
    refs:
      - master
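Both jobs pin data-supply to DATA_SUPPLY_COMMIT and then run ./validate.py over the datasets. The repository's actual validate.py is not shown in this diff; the snippet below is only a hypothetical illustration of the kind of check such a validator performs, here verifying the datasetSchemaVersion that this merge bumps to 4.1.0 (the "about" nesting and the expected version are assumptions taken from the hunks below):

# Hypothetical illustration only -- NOT the repository's validate.py.
# Walks a corpus directory and flags datasetDoc.json files whose
# datasetSchemaVersion differs from the expected value.
import json
import sys
from pathlib import Path

EXPECTED_SCHEMA_VERSION = "4.1.0"  # assumption: the version this merge syncs to


def check_dataset_docs(corpus_dir):
    errors = 0
    for doc_path in Path(corpus_dir).rglob("datasetDoc.json"):
        with doc_path.open() as f:
            doc = json.load(f)
        # Assumption: version fields live under the "about" object, as in the JSON hunks below.
        version = doc.get("about", {}).get("datasetSchemaVersion")
        if version != EXPECTED_SCHEMA_VERSION:
            print(f"{doc_path}: datasetSchemaVersion is {version!r}, expected {EXPECTED_SCHEMA_VERSION}")
            errors += 1
    return errors


if __name__ == "__main__":
    corpus = sys.argv[1] if len(sys.argv) > 1 else "."
    sys.exit(1 if check_dataset_docs(corpus) else 0)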
@@ -8,10 +8,10 @@
"source": "OpenML",
"sourceURI": "http://www.openml.org/d/299",
"approximateSize": "",
"datasetVersion": "4.0.0",
"datasetSchemaVersion": "4.0.0",
"datasetVersion": "4.1.0",
"datasetSchemaVersion": "4.1.0",
"redacted": false,
"digest": "84c5470f05abbf1ce79cbc0bc418d7e022e18307b28d44bd87545a61038c9988"
"digest": "2c6c5d25f27965717f0198a83fff69f7385bba9f1700e9ef97ac67df5ed809a4"
},
"dataResources": [
{
@@ -3,8 +3,8 @@
"problemID": "299_libras_move_MIN_METADATA_problem",
"problemName": "libras_move",
"problemDescription": "**Author**: Daniel Baptista Dias, Sarajane Marques Peres, Helton Hideraldo Biscaro \nUniversity of Sao Paulo, School of Art, Sciences and Humanities, Sao Paulo, SP, Brazil \n**Source**: Unknown - November 2008 \n**Please cite**: \n\n### LIBRAS Movement Database\nLIBRAS, acronym of the Portuguese name \"LIngua BRAsileira de Sinais\", is the official brazilian sign language. The dataset (movement_libras) contains 15 classes of 24 instances each, where each class references to a hand movement type in LIBRAS. The hand movement is represented as a bidimensional curve performed by the hand in a period of time. The curves were obtained from videos of hand movements, with the Libras performance from 4 different people, during 2 sessions. Each video corresponds to only one hand movement and has about $7$ seconds. Each video corresponds to a function F in a functions space which is the continual version of the input dataset. In the video pre-processing, a time normalization is carried out selecting 45 frames from each video, in according to an uniform distribution. In each frame, the centroid pixels of the segmented objects (the hand) are found, which compose the discrete version of the curve F with 45 points. All curves are normalized in the unitary space.\nIn order to prepare these movements to be analysed by algorithms, we have carried out a mapping operation, that is, each curve F is mapped in a representation with 90 features, with representing the coordinates of movement. \nEach instance represents 45 points on a bi-dimensional space, which can be plotted in an ordered way (from 1 through 45 as the X coordinate) in order to draw the path of the movement.",
"problemVersion": "4.0.0",
"problemSchemaVersion": "4.0.0",
"problemVersion": "4.1.0",
"problemSchemaVersion": "4.1.0",
"taskKeywords": [
"classification",
"multiClass",
@@ -51,11 +51,11 @@
},
"performanceMetrics": [
{
"metric": "accuracy"
"metric": "rocAucMacro"
}
]
},
"expectedOutputs": {
"predictionsFile": "predictions.csv"
}
}
\ No newline at end of file
}
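The problem docs in this merge also switch the performance metric from accuracy to rocAucMacro. A minimal sketch of how that metric can be computed with scikit-learn, assuming rocAucMacro corresponds to a macro-averaged one-vs-rest ROC AUC over predicted class probabilities (the exact D3M scorer may differ; the data below is made up):

# Sketch: macro-averaged one-vs-rest ROC AUC on toy multi-class predictions.
import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 1, 2, 1, 0, 2])   # toy true labels
y_score = np.array([                    # toy predicted class probabilities
    [0.7, 0.2, 0.1],
    [0.1, 0.8, 0.1],
    [0.2, 0.2, 0.6],
    [0.3, 0.5, 0.2],
    [0.6, 0.3, 0.1],
    [0.1, 0.3, 0.6],
])

# average="macro" weights every class equally; multi_class="ovr" scores each
# class against the rest, which is our reading of the "macro" in rocAucMacro.
score = roc_auc_score(y_true, y_score, average="macro", multi_class="ovr")
print(f"rocAucMacro ~= {score:.3f}")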
# -*- coding: utf-8 -*-
"""
Given the path to a data file (learningData.csv), this script generates a split
and saves it in the output directory.

@author: JO21372
"""
import os
import csv
import sys
import numpy as np
import d3m_utils as utils
from collections import Counter
from sklearn.model_selection import train_test_split

DEFAULT_TRAIN_FRACTION = .8
SPLIT_RANDOM_STATE = 42
TYPE_TEST = "TEST"
TYPE_TRAIN = "TRAIN"


def readData(dataFilePath, classKey='class', d3mIndexKey='d3mIndex'):
    """ Returns 3 lists:
        1. d3mInds - d3m indices of samples with labels
        2. klasses - labels of samples, indexed by the corresponding elements of 1.
        3. missingLabelInds - d3m indices of samples with missing labels

        Note: Indices of samples that do not have a label don't appear in d3mInds.

        Arguments:
        1. dataFilePath - path to a data file (learningData.csv)
        2. classKey (default: 'class') - key of the class column
        3. d3mIndexKey (default: 'd3mIndex') - key of the d3m index column
    """
    d3mInds = []
    klasses = []
    missingLabelInds = []

    with open(dataFilePath, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        line_count = 0
        for row in csv_reader:
            # Print headers
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')

            klass = row[classKey]
            d3mIndex = row[d3mIndexKey]

            # Check if the label is missing and record it
            if klass.strip() == "":
                missingLabelInds.append(d3mIndex)
            else:
                d3mInds.append(d3mIndex)
                klasses.append(klass)
            line_count += 1

        print(f'Processed {line_count} lines.')
        print(f'Number of samples with no label: {len(missingLabelInds)}')
        print(f'Number of samples with labels: {len(d3mInds)}')
        print(f'Total number of samples: {len(d3mInds) + len(missingLabelInds)}')

    return d3mInds, klasses, missingLabelInds


def getNTest(nInstances, trainFraction):
    return round((1 - trainFraction) * nInstances)


def safeTrainTestSplit(indices, missingLabelInds, labels, doStratify, trainFraction, randState=SPLIT_RANDOM_STATE):
    """
    Returns two numpy arrays containing the d3m indices of the train and test samples,
    respectively.
    """
    print(f'Splitting samples into a {trainFraction * 100} % train set.')

    ## Classes with only one sample should be added to the train set after the split has been made.
    # Verify whether there are classes with just one sample.
    classesWithOneSample = set()
    counts = Counter(labels)
    #print(counts)
    for y, count in counts.items():
        if count < 2:
            print(f"** WARNING: Dataset contains only 1 sample of class: {y} **")
            classesWithOneSample.add(y)

    if len(classesWithOneSample) == 0:
        filteredIndices = indices
        filteredLabels = labels
        lonelyIndices = []
    else:
        filteredIndices = []
        filteredLabels = []
        lonelyIndices = []
        for i in range(len(indices)):
            indx = indices[i]
            label = labels[i]
            if label in classesWithOneSample:
                lonelyIndices.append(indx)
            else:
                filteredIndices.append(indx)
                filteredLabels.append(label)

    # Get the test sample size
    nTest = getNTest(len(filteredIndices), trainFraction)

    # Stratify?
    stratify = None
    if doStratify:
        stratify = filteredLabels

    # Split
    print(f'Splitting: random_state = {randState}, test_size = {nTest}, stratify={doStratify}\n')
    indxTrain, indxTest = train_test_split(np.array(filteredIndices), test_size=nTest, random_state=randState, stratify=stratify)

    # If samples with missing labels are present in the dataset, add them to the train set
    if len(missingLabelInds) > 0:
        print(f"\n** WARNING: Number of missing labels: {len(missingLabelInds)}. Adding those samples to train set.**\n")
        indxTrain = np.append(indxTrain, missingLabelInds)

    # Add lonely samples to the train set as well
    if len(lonelyIndices) > 0:
        indxTrain = np.append(indxTrain, lonelyIndices)

    return indxTrain, indxTest


def writeDataSplitFile(indxTrain, indxTest, outDir, splitFile='dataSplits.csv'):
    res = []
    for ind in indxTrain:
        res.append((ind, TYPE_TRAIN))
    for ind in indxTest:
        res.append((ind, TYPE_TEST))

    outFile = os.path.join(outDir, splitFile)
    print(f'Writing split to file {outFile}')

    # Sort rows
    res.sort(key=lambda tup: int(tup[0]))

    # Write the file
    with open(outFile, 'w') as outF:
        # Write the header
        outF.write("d3mIndex,type,repeat,fold\n")
        for tup in res:
            outF.write(tup[0] + "," + tup[1] + "," + '0,0\n')


def generateSplitForDataset(corporaBaseDir, datasetName):
    dataFilePath = os.path.join(corporaBaseDir, datasetName, datasetName + "_dataset", 'tables', 'learningData.csv')
    outDir = utils.getProblemDir(corporaBaseDir, datasetName)
    testRatio, doStratify, randomSeed, splitsFile, classColName = utils.getSplitParameters(corporaBaseDir, datasetName)

    d3mInds, labels, missingLabelInds = readData(dataFilePath, classKey=classColName)
    indxTrain, indxTest = safeTrainTestSplit(d3mInds, missingLabelInds, labels, doStratify, 1 - testRatio, randomSeed)
    writeDataSplitFile(indxTrain, indxTest, outDir, splitsFile)

    # Report
    #print(indxTrain)
    #print(indxTest)
    totalNumSamples = len(indxTrain) + len(indxTest)
    numTrainSamples = len(indxTrain)
    numTestSamples = len(indxTest)
    print(f"Num of train samples: {numTrainSamples} ({numTrainSamples/totalNumSamples * 100}%)")
    print(f"Num of test samples: {numTestSamples} ({numTestSamples/totalNumSamples * 100}%)")


if __name__ == '__main__':
    """
    corporaBaseDir - directory where the corpora are located.
    datasetListFile - path to a file containing the names of datasets to be processed.
    """
    if len(sys.argv) != 3:
        print("Usage: python splitData.py <corporaBaseDir> <datasetListFile>")
        sys.exit()

    corporaBaseDir = sys.argv[1]
    datasetListFile = sys.argv[2]

    datasets = utils.readListFromFile(datasetListFile)
    for ds in datasets:
        print(f'\n\nProcessing dataset {ds}')
        generateSplitForDataset(corporaBaseDir, ds)
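For reference, a self-contained toy run of the same splitting idea as safeTrainTestSplit above (stratified split with singleton classes forced into TRAIN). It re-implements the logic inline rather than importing splitData.py, since that script depends on the local d3m_utils module; all data below is made up for illustration:

# Toy illustration of the split behavior: stratified 80/20 split, with any
# class that has only one sample kept in the TRAIN partition.
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split

indices = [str(i) for i in range(11)]
labels = ["a", "a", "a", "a", "b", "b", "b", "b", "b", "b", "c"]  # "c" is a singleton class

counts = Counter(labels)
lonely = {y for y, c in counts.items() if c < 2}
kept = [(i, y) for i, y in zip(indices, labels) if y not in lonely]
kept_idx = [i for i, _ in kept]
kept_lbl = [y for _, y in kept]

n_test = round(0.2 * len(kept_idx))
train_idx, test_idx = train_test_split(
    np.array(kept_idx), test_size=n_test, random_state=42, stratify=kept_lbl)

# Singleton-class samples go to TRAIN, mirroring lonelyIndices in the script.
train_idx = np.append(train_idx, [i for i, y in zip(indices, labels) if y in lonely])
print("TRAIN:", sorted(train_idx, key=int))
print("TEST: ", sorted(test_idx, key=int))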