Commit 5c146982 authored by Swaroop Vattam

synced /LL0_186_braziltourism dataset

parent fa6f3229
Pipeline #25 passed with stage
in 55 minutes and 18 seconds
@@ -11,7 +11,7 @@
     "datasetSchemaVersion": "4.0.0",
     "redacted": false,
     "datasetVersion": "4.0.0",
-    "digest": "a3e4ec6bc119785f08a00187f5cf8717a0fd3ed9f50a44d5885dba977953bf56"
+    "digest": "a9b3df16939c440aa43940c28ffb0e2dc0c40525baf4583cb1168d33795ca4b1"
   },
   "dataResources": [
     {
...
# -*- coding: utf-8 -*-
"""
Given the path to a data file (learningData.csv), this script generates a
train/test split and saves it in the output directory.
@author: JO21372
"""
import os
import csv
import sys
import numpy as np
import d3m_utils as utils
from collections import Counter
from sklearn.model_selection import train_test_split

DEFAULT_TRAIN_FRACTION = .8
SPLIT_RANDOM_STATE = 42
TYPE_TEST = "TEST"
TYPE_TRAIN = "TRAIN"

def readData(dataFilePath, classKey='class', d3mIndexKey='d3mIndex'):
    """ Returns 3 lists:
        1. d3mInds - d3m indices of samples with labels
        2. klasses - labels of samples, indexed by the corresponding elements of 1.
        3. missingLabelInds - d3m indices of samples with missing labels
        Note: Indices of samples that do not have a label don't appear in d3mInds.
        Arguments:
        1. dataFilePath - path to a data file (learningData.csv)
        2. classKey (default: 'class') - key of the class column
        3. d3mIndexKey (default: 'd3mIndex') - key of the d3m index column
    """
    d3mInds = []
    klasses = []
    missingLabelInds = []
    with open(dataFilePath, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        line_count = 0
        for row in csv_reader:
            # Print headers
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
            klass = row[classKey]
            d3mIndex = row[d3mIndexKey]
            # Check whether the label is missing and record accordingly
            if klass.strip() == "":
                missingLabelInds.append(d3mIndex)
            else:
                d3mInds.append(d3mIndex)
                klasses.append(klass)
            line_count += 1
        print(f'Processed {line_count} lines.')
        print(f'Number of samples with no label: {len(missingLabelInds)}')
        print(f'Number of samples with labels: {len(d3mInds)}')
        print(f'Total number of samples: {len(d3mInds) + len(missingLabelInds)}')
    return d3mInds, klasses, missingLabelInds
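
# Illustrative sketch (not part of the original script): given a minimal
# learningData.csv such as
#
#   d3mIndex,class
#   0,yes
#   1,
#   2,no
#
# readData(path) returns (['0', '2'], ['yes', 'no'], ['1']): row 1 has an
# empty label, so its index lands in missingLabelInds. Indices and labels are
# kept as strings, exactly as read from the CSV.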

def getNTest(nInstances, trainFraction):
    return round((1 - trainFraction) * nInstances)
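
# For example, with the default fraction: getNTest(100, DEFAULT_TRAIN_FRACTION)
# == round(0.2 * 100) == 20, i.e. an 80/20 train/test split.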

def safeTrainTestSplit(indices, missingLabelInds, labels, doStratify, trainFraction, randState=SPLIT_RANDOM_STATE):
    """
    Returns two numpy arrays containing the d3m indices of the train and test
    samples, respectively.
    """
    print(f'Splitting samples into a {trainFraction * 100} % train set.')
    # Classes with only one sample should be added to the train set after the
    # split has been made. First, check whether any class has just one sample.
    classesWithOneSample = set()
    counts = Counter(labels)
    # print(counts)
    for y, count in counts.items():
        if count < 2:
            print(f"** WARNING: Dataset contains only 1 sample of class: {y} **")
            classesWithOneSample.add(y)
    if len(classesWithOneSample) == 0:
        filteredIndices = indices
        filteredLabels = labels
        lonelyIndices = []
    else:
        filteredIndices = []
        filteredLabels = []
        lonelyIndices = []
        for i in range(len(indices)):
            indx = indices[i]
            label = labels[i]
            if label in classesWithOneSample:
                lonelyIndices.append(indx)
            else:
                filteredIndices.append(indx)
                filteredLabels.append(label)
    # Get the test sample size
    nTest = getNTest(len(filteredIndices), trainFraction)
    # Stratify?
    stratify = None
    if doStratify:
        stratify = filteredLabels
    # Split
    print(f'Splitting: random_state = {randState}, test_size = {nTest}, stratify={doStratify}\n')
    indxTrain, indxTest = train_test_split(np.array(filteredIndices), test_size=nTest,
                                           random_state=randState, stratify=stratify)
    # If samples with missing labels are present in the dataset, add them to the train set
    if len(missingLabelInds) > 0:
        print(f"\n** WARNING: Number of missing labels: {len(missingLabelInds)}. Adding those samples to the train set. **\n")
        indxTrain = np.append(indxTrain, missingLabelInds)
    # Add the lonely (single-sample-class) samples to the train set as well
    if len(lonelyIndices) > 0:
        indxTrain = np.append(indxTrain, lonelyIndices)
    return indxTrain, indxTest
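
# Illustrative sketch (hypothetical values, not part of the original script):
# a class with a single sample is excluded from the sklearn split and appended
# to the train side afterwards, so stratification cannot fail on it:
#
#   indices = [str(i) for i in range(10)]
#   labels = ['a'] * 5 + ['b'] * 4 + ['c']          # class 'c' occurs once
#   indxTrain, indxTest = safeTrainTestSplit(indices, [], labels,
#                                            doStratify=True, trainFraction=0.8)
#   # nTest = round(0.2 * 9) = 2; index '9' (the lone 'c') always lands in indxTrain.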

def writeDataSplitFile(indxTrain, indxTest, outDir, splitFile='dataSplits.csv'):
    res = []
    for ind in indxTrain:
        res.append((ind, TYPE_TRAIN))
    for ind in indxTest:
        res.append((ind, TYPE_TEST))
    outFile = os.path.join(outDir, splitFile)
    print(f'Writing split to file {outFile}')
    # Sort rows by d3m index
    res.sort(key=lambda tup: int(tup[0]))
    # Write the file
    with open(outFile, 'w') as outF:
        # Write header
        outF.write("d3mIndex,type,repeat,fold\n")
        for tup in res:
            outF.write(tup[0] + "," + tup[1] + "," + '0,0\n')
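
# The resulting file follows the D3M dataSplits.csv layout, e.g.:
#
#   d3mIndex,type,repeat,fold
#   0,TRAIN,0,0
#   1,TEST,0,0
#   2,TRAIN,0,0
#
# Rows are sorted numerically by d3mIndex; repeat and fold are hard-coded to 0
# because the script produces a single holdout split.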

def generateSplitForDataset(corporaBaseDir, datasetName):
    dataFilePath = os.path.join(corporaBaseDir, datasetName, datasetName + "_dataset", 'tables', 'learningData.csv')
    outDir = utils.getProblemDir(corporaBaseDir, datasetName)
    testRatio, doStratify, randomSeed, splitsFile, classColName = utils.getSplitParameters(corporaBaseDir, datasetName)
    d3mInds, labels, missingLabelInds = readData(dataFilePath, classKey=classColName)
    indxTrain, indxTest = safeTrainTestSplit(d3mInds, missingLabelInds, labels, doStratify, 1 - testRatio, randomSeed)
    writeDataSplitFile(indxTrain, indxTest, outDir, splitsFile)
    # Report
    # print(indxTrain)
    # print(indxTest)
    totalNumSamples = len(indxTrain) + len(indxTest)
    numTrainSamples = len(indxTrain)
    numTestSamples = len(indxTest)
    print(f"Num of train samples: {numTrainSamples} ({numTrainSamples/totalNumSamples * 100}%)")
    print(f"Num of test samples: {numTestSamples} ({numTestSamples/totalNumSamples * 100}%)")


if __name__ == '__main__':
    # corporaBaseDir - directory containing the corpora.
    # datasetListFile - path to a file containing the names of the datasets to process.
    if len(sys.argv) != 3:
        print("Usage: python splitData.py <corporaBaseDir> <datasetListFile>")
        sys.exit(1)
    corporaBaseDir = sys.argv[1]
    datasetListFile = sys.argv[2]
    datasets = utils.readListFromFile(datasetListFile)
    for ds in datasets:
        print(f'\n\nProcessing dataset {ds}')
        generateSplitForDataset(corporaBaseDir, ds)
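
# Example invocation (hypothetical paths, for illustration):
#
#   python splitData.py /data/corpora datasets.txt
#
# where datasets.txt lists one dataset name per line (e.g. LL0_186_braziltourism).
# For each name the script reads
# /data/corpora/LL0_186_braziltourism/LL0_186_braziltourism_dataset/tables/learningData.csv
# and writes the split file into the problem directory resolved by
# utils.getProblemDir.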