@@ -37,7 +37,7 @@ def a9a(dataset_dir=None):
3737 a9a X train dataset (39073, 123)
3838 a9a y train dataset (39073, 1)
3939 a9a X test dataset (9769, 123)
40- a9a y train dataset (9769, 1)
40+ a9a y test dataset (9769, 1)
4141 """
4242 dataset_name = 'a9a'
4343 os .makedirs (dataset_dir , exist_ok = True )
@@ -75,7 +75,7 @@ def ijcnn(dataset_dir=None):
7575 ijcnn X train dataset (153344, 22)
7676 ijcnn y train dataset (153344, 1)
7777 ijcnn X test dataset (38337, 22)
78- ijcnn y train dataset (38337, 1)
78+ ijcnn y test dataset (38337, 1)
7979 """
8080 dataset_name = 'ijcnn'
8181 os .makedirs (dataset_dir , exist_ok = True )
@@ -113,7 +113,7 @@ def skin_segmentation(dataset_dir=None):
113113 skin_segmentation X train dataset (196045, 3)
114114 skin_segmentation y train dataset (196045, 1)
115115 skin_segmentation X test dataset (49012, 3)
116- skin_segmentation y train dataset (49012, 1)
116+ skin_segmentation y test dataset (49012, 1)
117117 """
118118 dataset_name = 'skin_segmentation'
119119 os .makedirs (dataset_dir , exist_ok = True )
@@ -151,7 +151,7 @@ def klaverjas(dataset_dir=None):
151151 klaverjas X train dataset (196045, 3)
152152 klaverjas y train dataset (196045, 1)
153153 klaverjas X test dataset (49012, 3)
154- klaverjas y train dataset (49012, 1)
154+ klaverjas y test dataset (49012, 1)
155155 """
156156 dataset_name = 'klaverjas'
157157 os .makedirs (dataset_dir , exist_ok = True )
@@ -184,7 +184,7 @@ def connect(dataset_dir=None):
184184 connect X train dataset (196045, 127)
185185 connect y train dataset (196045, 1)
186186 connect X test dataset (49012, 127)
187- connect y train dataset (49012, 1)
187+ connect y test dataset (49012, 1)
188188 """
189189 dataset_name = 'connect'
190190 os .makedirs (dataset_dir , exist_ok = True )
@@ -223,7 +223,7 @@ def mnist(dataset_dir=None):
223223 mnist X train dataset (60000, 784)
224224 mnist y train dataset (60000, 1)
225225 mnist X test dataset (10000, 784)
226- mnist y train dataset (10000, 1)
226+ mnist y test dataset (10000, 1)
227227 """
228228 dataset_name = 'mnist'
229229
@@ -258,7 +258,7 @@ def sensit(dataset_dir=None):
258258 sensit X train dataset (196045, 3)
259259 sensit y train dataset (196045, 1)
260260 sensit X test dataset (49012, 3)
261- sensit y train dataset (49012, 1)
261+ sensit y test dataset (49012, 1)
262262 """
263263 dataset_name = 'sensit'
264264 os .makedirs (dataset_dir , exist_ok = True )
@@ -285,12 +285,16 @@ def sensit(dataset_dir=None):
285285
286286def covertype (dataset_dir = None ):
287287 """
288-
289- covertype X train dataset (196045, 3)
290- covertype y train dataset (196045, 1)
291- covertype X test dataset (49012, 3)
292- covertype y train dataset (49012, 1)
293-
288+ Abstract: This is the original version of the famous
289+ covertype dataset in ARFF format.
290+ Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson
291+ Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype)
292+
293+ Classification task. n_classes = 7.
294+ covertype X train dataset (390852, 54)
295+ covertype y train dataset (390852, 1)
296+ covertype X test dataset (97713, 54)
297+ covertype y test dataset (97713, 1)
294298 """
295299 dataset_name = 'covertype'
296300 os .makedirs (dataset_dir , exist_ok = True )
@@ -313,6 +317,41 @@ def covertype(dataset_dir=None):
313317 return True
314318
315319
320+ def codrnanorm (dataset_dir = None ):
321+ """
322+ Abstract: Detection of non-coding RNAs on the basis of predicted secondary
323+ structure formation free energy change.
324+ Author: Andrew V Uzilov, Joshua M Keegan, David H Mathews.
325+ Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets)
326+
327+ Classification task. n_classes = 2.
328+ codrnanorm X train dataset (390852, 8)
329+ codrnanorm y train dataset (390852, 1)
330+ codrnanorm X test dataset (97713, 8)
331+ codrnanorm y test dataset (97713, 1)
332+ """
333+ dataset_name = 'codrnanorm'
334+ os .makedirs (dataset_dir , exist_ok = True )
335+
336+ X , y = fetch_openml (name = 'codrnaNorm' , return_X_y = True ,
337+ as_frame = False , data_home = dataset_dir )
338+ X = pd .DataFrame (X .todense ())
339+ y = pd .DataFrame (y )
340+
341+ logging .info (f'{ dataset_name } dataset is downloaded' )
342+ logging .info ('reading CSV file...' )
343+
344+ x_train , x_test , y_train , y_test = train_test_split (
345+ X , y , test_size = 0.2 , random_state = 42 )
346+ for data , name in zip ((x_train , x_test , y_train , y_test ),
347+ ('x_train' , 'x_test' , 'y_train' , 'y_test' )):
348+ filename = f'{ dataset_name } _{ name } .csv'
349+ data .to_csv (os .path .join (dataset_dir , filename ),
350+ header = False , index = False )
351+ logging .info (f'dataset { dataset_name } ready.' )
352+ return True
353+
354+
316355def gisette (dataset_dir = None ):
317356 """
318357 GISETTE is a handwritten digit recognition problem.
@@ -323,7 +362,7 @@ def gisette(dataset_dir=None):
323362 gisette X train dataset (6000, 5000)
324363 gisette y train dataset (6000, 1)
325364 gisette X test dataset (1000, 5000)
326- gisette y train dataset (1000, 1)
365+ gisette y test dataset (1000, 1)
327366 """
328367 dataset_name = 'gisette'
329368 os .makedirs (dataset_dir , exist_ok = True )
0 commit comments