Commit c8289da0 authored by Swaroop Vattam

migration to version4

parent 031a733c
Pipeline #7 failed with stage
in 23 minutes and 21 seconds
# Public D3M datasets
# Private D3M datasets
This repository contains public D3M datasets.
This repository contains private D3M datasets. **Do not distribute them.**
**Public D3M datasets are available [here](https://datasets.datadrivendiscovery.org/d3m/datasets).**
Please report any issues with private datasets in the [data-supply repository](https://gitlab.com/datadrivendiscovery/data-supply/issues).
Dataset schemas and related documentation are available in the [data-supply repository](https://gitlab.com/datadrivendiscovery/data-supply).
......@@ -9,15 +13,15 @@ Datasets schemas and related documentation is available in [data-supply reposito
Download datasets using [git LFS](https://git-lfs.github.com/):
```
$ git lfs clone git@datasets.datadrivendiscovery.org:d3m/datasets.git
$ git lfs clone git@gitlab.datadrivendiscovery.org:d3m/datasets.git
```
Note: use `git lfs clone` instead of `git clone` because it
is faster.
This will take time and, especially, disk space. Currently all
datasets are around 46 GB, but the whole directory with cloned
repository and git metadata is around 65 GB. Running
datasets are around 54 GB, but the whole directory with cloned
repository and git metadata is around 84 GB. Running
`git lfs prune` might help by removing old and unreferenced files.
Repository is organized so that all files larger than 100 KB are
......@@ -31,7 +35,7 @@ It is possible to download only part of the repository. First clone
without downloading files managed by git LFS:
```
$ git lfs clone git@datasets.datadrivendiscovery.org:d3m/datasets.git -X "*"
$ git lfs clone git@gitlab.datadrivendiscovery.org:d3m/datasets.git -X "*"
```
This will download and checkout all files smaller than 100 KB.
......
......@@ -7,28 +7,32 @@
"license": "open",
"source": "USPS",
"sourceURI": "http://www.cad.zju.edu.cn/home/dengcai/Data/MLData.html",
"datasetSchemaVersion": "3.2.0",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "2.0",
"digest": "c75da4fea7a7d4e4c67b9ff8175ab646e2fab606e2006a2d929e3cc8de6d7012"
"datasetVersion": "4.0.0",
"digest": "dc64b78bce3f4a88dfdb0ebd834bbce42b81bb137c8bd4cd152db6777be5d62d"
},
"dataResources": [
{
"resID": "0",
"resPath": "media/",
"resType": "image",
"resFormat": [
"image/png"
],
"resFormat": {
"image/png": [
"png"
]
},
"isCollection": true
},
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": [
"text/csv"
],
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
......
......@@ -3,10 +3,13 @@
"problemID": "124_188_usps_problem",
"problemName": "usps_problem",
"problemDescription": "Multiclass image classification problem. Each image belongs to one of 10 classes.",
"taskType": "classification",
"taskSubType": "multiClass",
"problemSchemaVersion": "3.2.0",
"problemVersion": "2.0"
"problemSchemaVersion": "4.0.0",
"problemVersion": "4.0.0",
"taskKeywords": [
"classification",
"multiClass",
"image"
]
},
"inputs": {
"data": [
......@@ -27,7 +30,27 @@
"testSize": 0.216,
"stratified": false,
"numRepeats": 0,
"splitsFile": "dataSplits.csv"
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_TRAIN"
}
],
"test": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_TEST"
}
],
"score": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
......@@ -38,4 +61,4 @@
"expectedOutputs": {
"predictionsFile": "predictions.csv"
}
}
}
\ No newline at end of file
......@@ -7,28 +7,32 @@
"license": "open",
"source": "USPS",
"sourceURI": "http://www.cad.zju.edu.cn/home/dengcai/Data/MLData.html",
"datasetSchemaVersion": "3.2.0",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "2.0",
"digest": "a8f27ab2e3cb1443d8cfbe2b3a286774ef8012b0c6ba4e41e029f4eda06f20f7"
"datasetVersion": "4.0.0",
"digest": "73d76f80d119e04a0aef61cd014d312df5d786fae76a592ef8c0932f0a509914"
},
"dataResources": [
{
"resID": "0",
"resPath": "media/",
"resType": "image",
"resFormat": [
"image/png"
],
"resFormat": {
"image/png": [
"png"
]
},
"isCollection": true
},
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": [
"text/csv"
],
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
......
{
"about": {
"problemID": "124_188_usps_problem_SCORE",
"problemID": "124_188_usps_problem",
"problemName": "usps_problem",
"problemDescription": "Multiclass image classification problem. Each image belongs to one of 10 classes.",
"taskType": "classification",
"taskSubType": "multiClass",
"problemSchemaVersion": "3.2.0",
"problemVersion": "2.0"
"problemSchemaVersion": "4.0.0",
"problemVersion": "4.0.0",
"taskKeywords": [
"classification",
"multiClass",
"image"
]
},
"inputs": {
"data": [
{
"datasetID": "124_188_usps_dataset_SCORE",
"datasetID": "124_188_usps_dataset",
"targets": [
{
"targetIndex": 0,
......@@ -27,7 +30,27 @@
"testSize": 0.216,
"stratified": false,
"numRepeats": 0,
"splitsFile": "dataSplits.csv"
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_TRAIN"
}
],
"test": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_TEST"
}
],
"score": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
......
......@@ -7,28 +7,32 @@
"license": "open",
"source": "USPS",
"sourceURI": "http://www.cad.zju.edu.cn/home/dengcai/Data/MLData.html",
"datasetSchemaVersion": "3.2.0",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "2.0",
"digest": "6861672270a24f3f199f7175b4904e6a588e1ed7433ed7cb1bdbbdd0225f8274"
"datasetVersion": "4.0.0",
"digest": "2c26e78b65ae0ae3ee6dc2ad6f28eaec138c6c37a23552bd33b3db2461a0652b"
},
"dataResources": [
{
"resID": "0",
"resPath": "media/",
"resType": "image",
"resFormat": [
"image/png"
],
"resFormat": {
"image/png": [
"png"
]
},
"isCollection": true
},
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": [
"text/csv"
],
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
......
{
"about": {
"problemID": "124_188_usps_problem_TEST",
"problemID": "124_188_usps_problem",
"problemName": "usps_problem",
"problemDescription": "Multiclass image classification problem. Each image belongs to one of 10 classes.",
"taskType": "classification",
"taskSubType": "multiClass",
"problemSchemaVersion": "3.2.0",
"problemVersion": "2.0"
"problemSchemaVersion": "4.0.0",
"problemVersion": "4.0.0",
"taskKeywords": [
"classification",
"multiClass",
"image"
]
},
"inputs": {
"data": [
{
"datasetID": "124_188_usps_dataset_TEST",
"datasetID": "124_188_usps_dataset",
"targets": [
{
"targetIndex": 0,
......@@ -27,7 +30,27 @@
"testSize": 0.216,
"stratified": false,
"numRepeats": 0,
"splitsFile": "dataSplits.csv"
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_TRAIN"
}
],
"test": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_TEST"
}
],
"score": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
......
......@@ -7,28 +7,32 @@
"license": "open",
"source": "USPS",
"sourceURI": "http://www.cad.zju.edu.cn/home/dengcai/Data/MLData.html",
"datasetSchemaVersion": "3.2.0",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "2.0",
"digest": "4fdd5af8fca34e57aeaf7fe085c174c1050efeb45f0646670a4134e341fea6c1"
"datasetVersion": "4.0.0",
"digest": "33b71f121dcd1466895e4f83ff20272ae57397582277359a862a7cbade2c4cf4"
},
"dataResources": [
{
"resID": "0",
"resPath": "media/",
"resType": "image",
"resFormat": [
"image/png"
],
"resFormat": {
"image/png": [
"png"
]
},
"isCollection": true
},
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": [
"text/csv"
],
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
......
{
"about": {
"problemID": "124_188_usps_problem_TRAIN",
"problemID": "124_188_usps_problem",
"problemName": "usps_problem",
"problemDescription": "Multiclass image classification problem. Each image belongs to one of 10 classes.",
"taskType": "classification",
"taskSubType": "multiClass",
"problemSchemaVersion": "3.2.0",
"problemVersion": "2.0"
"problemSchemaVersion": "4.0.0",
"problemVersion": "4.0.0",
"taskKeywords": [
"classification",
"multiClass",
"image"
]
},
"inputs": {
"data": [
{
"datasetID": "124_188_usps_dataset_TRAIN",
"datasetID": "124_188_usps_dataset",
"targets": [
{
"targetIndex": 0,
......@@ -27,7 +30,27 @@
"testSize": 0.216,
"stratified": false,
"numRepeats": 0,
"splitsFile": "dataSplits.csv"
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_TRAIN"
}
],
"test": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_TEST"
}
],
"score": [
{
"from": "124_188_usps_dataset",
"to": "124_188_usps_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
......
......@@ -8,19 +8,21 @@
"source": "OpenML",
"sourceURI": "http://www.openml.org/d/1491",
"approximateSize": "",
"datasetVersion": "2.0",
"datasetSchemaVersion": "3.2.0",
"datasetVersion": "4.0.0",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"digest": "f0a796e02c16e16e4edf199e7062eeb7f93abc80b1ca92b0cb00828feb5b5ed7"
"digest": "eacc121b204bc03038e24cc691fe1dfe05ca76b99e84867f6240e31a29bf3d57"
},
"dataResources": [
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": [
"text/csv"
],
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
......
......@@ -3,10 +3,13 @@
"problemID": "1491_one_hundred_plants_margin_problem",
"problemName": "one_hundred_plants_margin_problem",
"problemDescription": "**Author**: James Cope, Thibaut Beghin, Paolo Remagnino, Sarah Barman. \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/One-hundred+plant+species+leaves+data+set) - 2010 \n**Please cite**: Charles Mallah, James Cope, James Orwell. Plant Leaf Classification Using Probabilistic Integration of Shape, Texture and Margin Features. Signal Processing, Pattern Recognition and Applications, in press. 2013. \n\n### Description\n\nOne-hundred plant species leaves dataset (Class = Margin).\n \n### Sources\n```\n (a) Original owners of colour Leaves Samples:\n\n James Cope, Thibaut Beghin, Paolo Remagnino, Sarah Barman. \n The colour images are not included. \n The Leaves were collected in the Royal Botanic Gardens, Kew, UK. \n email: james.cope@kingston.ac.uk \n \n (b) This dataset consists of work carried out by James Cope, Charles Mallah, and James Orwell. \n Donor of database Charles Mallah: charles.mallah@kingston.ac.uk; James Cope: james.cope@kingston.ac.uk \n```\n\n### Dataset Information\n\nThe original data directory contains the binary images (masks) of the leaf samples (colour images not included).\nThere are three features for each image: Shape, Margin and Texture.\nFor each feature, a 64 element vector is given per leaf sample.\nThese vectors are taken as a contiguous descriptor (for shape) or histograms (for texture and margin).\nSo, there are three different files, one for each feature problem: \n * 'data_Sha_64.txt' -> prediction based on shape\n * 'data_Tex_64.txt' -> prediction based on texture\n * 'data_Mar_64.txt' -> prediction based on margin [**dataset provided here**] \n\nEach row has a 64-element feature vector followed by the Class label.\nThere is a total of 1600 samples with 16 samples per leaf class (100 classes), and no missing values.\n\n### Attributes Information\n\nThree 64 element feature vectors per sample.\n\n### Relevant Papers\n\nCharles Mallah, James Cope, James Orwell. 
\nPlant Leaf Classification Using Probabilistic Integration of Shape, Texture and Margin Features. \nSignal Processing, Pattern Recognition and Applications, in press.\n\nJ. Cope, P. Remagnino, S. Barman, and P. Wilkin.\nPlant texture classification using gabor co-occurrences.\nAdvances in Visual Computing,\npages 699-677, 2010.\n\nT. Beghin, J. Cope, P. Remagnino, and S. Barman.\nShape and texture based plant leaf classification. \nIn: Advanced Concepts for Intelligent Vision Systems,\npages 345-353. Springer, 2010.",
"taskType": "classification",
"taskSubType": "multiClass",
"problemVersion": "2.0",
"problemSchemaVersion": "3.2.0"
"problemVersion": "4.0.0",
"problemSchemaVersion": "4.0.0",
"taskKeywords": [
"classification",
"multiClass",
"tabular"
]
},
"inputs": {
"data": [
......@@ -28,7 +31,27 @@
"stratified": true,
"numRepeats": 0,
"randomSeed": 42,
"splitsFile": "dataSplits.csv"
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "1491_one_hundred_plants_margin_dataset",
"to": "1491_one_hundred_plants_margin_dataset_TRAIN"
}
],
"test": [
{
"from": "1491_one_hundred_plants_margin_dataset",
"to": "1491_one_hundred_plants_margin_dataset_TEST"
}
],
"score": [
{
"from": "1491_one_hundred_plants_margin_dataset",
"to": "1491_one_hundred_plants_margin_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
......@@ -39,4 +62,4 @@
"expectedOutputs": {
"predictionsFile": "predictions.csv"
}
}
}
\ No newline at end of file
......@@ -4,19 +4,21 @@
"datasetName": "NULL",
"license": "CC-BY license",
"approximateSize": "",
"datasetVersion": "2.0",
"datasetSchemaVersion": "3.2.0",
"datasetVersion": "4.0.0",
"datasetSchemaVersion": "4.0.0",
"redacted": true,
"digest": "b8cfa050188ff8ec11685bcbf6681ee8cd12b2a427de235bfda099f093aae4c5"
"digest": "f09aa791b78acdcfdc8d6acd7bb5214a3c4e37515c9b363bbb83687b7d8d8eef"
},
"dataResources": [
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": [
"text/csv"
],