Commit f7ab6393 authored by Swaroop Vattam

sync step1

parent 6420b4ce
@@ -42840,3 +42840,7 @@ seed_datasets_current/32_fma_MIN_METADATA/TRAIN/dataset_TRAIN/media/127350.mp3 filter=lfs diff=lfs merge=lfs -text
 seed_datasets_current/32_fma_MIN_METADATA/TRAIN/dataset_TRAIN/media/127996.mp3 filter=lfs diff=lfs merge=lfs -text
 seed_datasets_current/32_fma_MIN_METADATA/TRAIN/dataset_TRAIN/media/123342.mp3 filter=lfs diff=lfs merge=lfs -text
 seed_datasets_current/32_fma_MIN_METADATA/TRAIN/dataset_TRAIN/media/126405.mp3 filter=lfs diff=lfs merge=lfs -text
+training_datasets/seed_datasets_archive/1491_one_hundred_plants_margin_clust/SCORE/dataset_TEST/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
+training_datasets/seed_datasets_archive/1491_one_hundred_plants_margin_clust/1491_one_hundred_plants_margin_clust_dataset/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
+training_datasets/seed_datasets_archive/1491_one_hundred_plants_margin_clust/TRAIN/dataset_TRAIN/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
+training_datasets/seed_datasets_archive/1491_one_hundred_plants_margin_clust/TEST/dataset_TEST/tables/learningData.csv filter=lfs diff=lfs merge=lfs -text
@@ -8,10 +8,10 @@
     "source": "OpenML",
     "sourceURI": "http://www.openml.org/d/299",
     "approximateSize": "",
-    "datasetVersion": "4.0.0",
-    "datasetSchemaVersion": "4.0.0",
+    "datasetVersion": "4.1.0",
+    "datasetSchemaVersion": "4.1.0",
     "redacted": false,
-    "digest": "84c5470f05abbf1ce79cbc0bc418d7e022e18307b28d44bd87545a61038c9988"
+    "digest": "2c6c5d25f27965717f0198a83fff69f7385bba9f1700e9ef97ac67df5ed809a4"
   },
   "dataResources": [
     {
...
@@ -3,8 +3,8 @@
   "problemID": "299_libras_move_MIN_METADATA_problem",
   "problemName": "libras_move",
   "problemDescription": "**Author**: Daniel Baptista Dias, Sarajane Marques Peres, Helton Hideraldo Biscaro \nUniversity of Sao Paulo, School of Art, Sciences and Humanities, Sao Paulo, SP, Brazil \n**Source**: Unknown - November 2008 \n**Please cite**: \n\n### LIBRAS Movement Database\nLIBRAS, acronym of the Portuguese name \"LIngua BRAsileira de Sinais\", is the official brazilian sign language. The dataset (movement_libras) contains 15 classes of 24 instances each, where each class references to a hand movement type in LIBRAS. The hand movement is represented as a bidimensional curve performed by the hand in a period of time. The curves were obtained from videos of hand movements, with the Libras performance from 4 different people, during 2 sessions. Each video corresponds to only one hand movement and has about $7$ seconds. Each video corresponds to a function F in a functions space which is the continual version of the input dataset. In the video pre-processing, a time normalization is carried out selecting 45 frames from each video, in according to an uniform distribution. In each frame, the centroid pixels of the segmented objects (the hand) are found, which compose the discrete version of the curve F with 45 points. All curves are normalized in the unitary space.\nIn order to prepare these movements to be analysed by algorithms, we have carried out a mapping operation, that is, each curve F is mapped in a representation with 90 features, with representing the coordinates of movement. \nEach instance represents 45 points on a bi-dimensional space, which can be plotted in an ordered way (from 1 through 45 as the X coordinate) in order to draw the path of the movement.",
-  "problemVersion": "4.0.0",
-  "problemSchemaVersion": "4.0.0",
+  "problemVersion": "4.1.0",
+  "problemSchemaVersion": "4.1.0",
   "taskKeywords": [
     "classification",
     "multiClass",
@@ -51,11 +51,11 @@
   },
   "performanceMetrics": [
     {
-      "metric": "accuracy"
+      "metric": "rocAucMacro"
     }
   ]
 },
 "expectedOutputs": {
   "predictionsFile": "predictions.csv"
 }
}
\ No newline at end of file
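
One consequence of the metric change above (accuracy -> rocAucMacro) is that predictions.csv must now carry per-class confidence scores rather than a single predicted label. A minimal sketch of what the new metric computes, assuming "rocAucMacro" maps to scikit-learn's one-vs-rest macro average; the arrays are made-up placeholders, not data from this commit:

# Sketch: macro-averaged ROC AUC for a multi-class problem.
import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 1, 2, 1, 0])   # true class labels (placeholder)
y_score = np.array([                 # per-class confidence scores,
    [0.8, 0.1, 0.1],                 # one column per class, rows sum to 1
    [0.2, 0.6, 0.2],
    [0.1, 0.2, 0.7],
    [0.3, 0.5, 0.2],
    [0.6, 0.3, 0.1],
])

# One-vs-rest AUC is computed per class, then averaged with equal weight.
auc = roc_auc_score(y_true, y_score, multi_class='ovr', average='macro')
print(f'macro ROC AUC: {auc:.3f}')
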
# -*- coding: utf-8 -*-
"""
Given the path to a data file (learningData.csv), this script generates a
train/test split and saves it in the output directory.

@author: JO21372
"""
import os
import csv
import sys
import numpy as np
import d3m_utils as utils
from collections import Counter
from sklearn.model_selection import train_test_split

DEFAULT_TRAIN_FRACTION = .8
SPLIT_RANDOM_STATE = 42
TYPE_TEST = "TEST"
TYPE_TRAIN = "TRAIN"
def readData(dataFilePath, classKey='class', d3mIndexKey='d3mIndex'):
    """ Returns 3 lists:
        1. d3mInds - d3m indices of samples with labels
        2. klasses - labels of samples, indexed by the corresponding elements of 1.
        3. missingLabelInds - d3m indices of samples with missing labels
        Note: indices of samples that do not have a label don't appear in d3mInds.

        Arguments:
        1. dataFilePath - path to a data file (learningData.csv)
        2. classKey (default: 'class') - key of the class column
        3. d3mIndexKey (default: 'd3mIndex') - key of the d3m index column
    """
    d3mInds = []
    klasses = []
    missingLabelInds = []
    with open(dataFilePath, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        line_count = 0
        for row in csv_reader:
            # Print headers once
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
            klass = row[classKey]
            d3mIndex = row[d3mIndexKey]
            # Record whether the label is missing
            if klass.strip() == "":
                missingLabelInds.append(d3mIndex)
            else:
                d3mInds.append(d3mIndex)
                klasses.append(klass)
            line_count += 1
        print(f'Processed {line_count} lines.')
        print(f'Number of samples with no label: {len(missingLabelInds)}')
        print(f'Number of samples with labels: {len(d3mInds)}')
        print(f'Total number of samples: {len(d3mInds) + len(missingLabelInds)}')
    return d3mInds, klasses, missingLabelInds
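
# Illustrative input (an assumption, not shipped with this script): readData()
# expects learningData.csv shaped like the following, where a blank class
# field marks a sample with a missing label:
#
#   d3mIndex,attr_1,attr_2,...,class
#   0,0.12,0.98,...,label_a
#   1,0.33,0.41,...,
#   2,0.57,0.77,...,label_b
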
def getNTest(nInstances, trainFraction):
    return round((1 - trainFraction) * nInstances)


def safeTrainTestSplit(indices, missingLabelInds, labels, doStratify, trainFraction, randState=SPLIT_RANDOM_STATE):
    """
    Returns two numpy arrays containing the d3m indices of the train and test
    samples, respectively.
    """
    print(f'Splitting samples into a {trainFraction * 100}% train set.')

    # Classes with a single sample cannot be stratified; they are added to the
    # train set after the split has been made.
    classesWithOneSample = set()
    counts = Counter(labels)
    for y, count in counts.items():
        if count < 2:
            print(f"** WARNING: Dataset contains only 1 sample of class: {y} **")
            classesWithOneSample.add(y)

    if len(classesWithOneSample) == 0:
        filteredIndices = indices
        filteredLabels = labels
        lonelyIndices = []
    else:
        filteredIndices = []
        filteredLabels = []
        lonelyIndices = []
        for i in range(len(indices)):
            indx = indices[i]
            label = labels[i]
            if label in classesWithOneSample:
                lonelyIndices.append(indx)
            else:
                filteredIndices.append(indx)
                filteredLabels.append(label)

    # Test sample size
    nTest = getNTest(len(filteredIndices), trainFraction)

    # Stratify?
    stratify = filteredLabels if doStratify else None

    # Split
    print(f'Splitting: random_state = {randState}, test_size = {nTest}, stratify = {doStratify}\n')
    indxTrain, indxTest = train_test_split(np.array(filteredIndices), test_size=nTest,
                                           random_state=randState, stratify=stratify)

    # If samples with missing labels are present in the dataset, add them to the train set
    if len(missingLabelInds) > 0:
        print(f"\n** WARNING: Number of missing labels: {len(missingLabelInds)}. Adding those samples to the train set. **\n")
        indxTrain = np.append(indxTrain, missingLabelInds)

    # Add single-sample ("lonely") classes to the train set as well
    if len(lonelyIndices) > 0:
        indxTrain = np.append(indxTrain, lonelyIndices)

    return indxTrain, indxTest
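
# Behavior note with a hypothetical call (not part of the original script):
# given labels ['a', 'a', 'b'] and stratification enabled, class 'b' has a
# single sample, so it is withheld from train_test_split (which would
# otherwise raise "The least populated class in y has only 1 member") and
# appended to the train side afterwards:
#
#   indxTrain, indxTest = safeTrainTestSplit(
#       ['0', '1', '2'], [], ['a', 'a', 'b'],
#       doStratify=True, trainFraction=0.5)
#   # '2' (the lone 'b' sample) always lands in indxTrain.
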
def writeDataSplitFile(indxTrain, indxTest, outDir, splitFile='dataSplits.csv'):
    res = []
    for ind in indxTrain:
        res.append((ind, TYPE_TRAIN))
    for ind in indxTest:
        res.append((ind, TYPE_TEST))

    outFile = os.path.join(outDir, splitFile)
    print(f'Writing split to file {outFile}')

    # Sort rows by d3m index
    res.sort(key=lambda tup: int(tup[0]))

    # Write file
    with open(outFile, 'w') as outF:
        # Header
        outF.write("d3mIndex,type,repeat,fold\n")
        for tup in res:
            outF.write(tup[0] + "," + tup[1] + ",0,0\n")
def generateSplitForDataset(corporaBaseDir, datasetName):
    dataFilePath = os.path.join(corporaBaseDir, datasetName, datasetName + "_dataset", 'tables', 'learningData.csv')
    outDir = utils.getProblemDir(corporaBaseDir, datasetName)
    testRatio, doStratify, randomSeed, splitsFile, classColName = utils.getSplitParameters(corporaBaseDir, datasetName)

    d3mInds, labels, missingLabelInds = readData(dataFilePath, classKey=classColName)
    indxTrain, indxTest = safeTrainTestSplit(d3mInds, missingLabelInds, labels, doStratify, 1 - testRatio, randomSeed)
    writeDataSplitFile(indxTrain, indxTest, outDir, splitsFile)

    # Report split sizes
    totalNumSamples = len(indxTrain) + len(indxTest)
    numTrainSamples = len(indxTrain)
    numTestSamples = len(indxTest)
    print(f"Num of train samples: {numTrainSamples} ({numTrainSamples / totalNumSamples * 100}%)")
    print(f"Num of test samples: {numTestSamples} ({numTestSamples / totalNumSamples * 100}%)")
if __name__ == '__main__':
    # corporaBaseDir - directory containing the corpora.
    # datasetListFile - path to a file listing the names of the datasets to process.
    if len(sys.argv) != 3:
        print("Usage: python splitData.py <corporaBaseDir> <datasetListFile>")
        sys.exit(1)

    corporaBaseDir = sys.argv[1]
    datasetListFile = sys.argv[2]

    datasets = utils.readListFromFile(datasetListFile)
    for ds in datasets:
        print(f'\n\nProcessing dataset {ds}')
        generateSplitForDataset(corporaBaseDir, ds)
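
For orientation, a short sketch of consuming the dataSplits.csv the script writes; the file paths here are hypothetical, and pandas is not a dependency of the script itself:

import pandas as pd

# dataSplits.csv has columns d3mIndex,type,repeat,fold; repeat and fold are
# always 0 because the script emits a single holdout split, not CV folds.
splits = pd.read_csv('dataSplits.csv')
data = pd.read_csv('learningData.csv')

train = data[data['d3mIndex'].isin(splits.loc[splits['type'] == 'TRAIN', 'd3mIndex'])]
test = data[data['d3mIndex'].isin(splits.loc[splits['type'] == 'TEST', 'd3mIndex'])]
print(f'{len(train)} train rows, {len(test)} test rows')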