DeepLearning(2)

数据处理与数据加载

Dataset

abstract class

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import numpy as np

import torch
from torch.utils.data import Dataset

class DiabetesDataset(Dataset):
    """Dataset backed by a CSV file that is read fully into memory.

    Two common strategies: (1) data fits in memory -- load everything up
    front, as done here; (2) data is too large -- keep a list of files and
    load samples lazily.
    """

    def __init__(self, FilePath):
        """Load the CSV at *FilePath* (comma-separated, float32).

        Assumes the label is the last column -- TODO confirm for your data.
        """
        # BUG FIX: the original called np.loadtext (no such function; it is
        # np.loadtxt) and named the methods _init_/_getitem_/_len_ with single
        # underscores, so Python never invoked them as the special methods.
        xy = np.loadtxt(FilePath, delimiter=',', dtype=np.float32)
        # Number of samples (rows).
        self.len = xy.shape[0]
        # Features: every column except the last.
        self.x_data = torch.from_numpy(xy[:, :-1])
        # Labels: index with [-1] (a list) so the slice stays 2-D; a bare -1
        # would produce a 1-D array and break per-sample tensor shapes.
        self.y_data = torch.from_numpy(xy[:, [-1]])

    def __getitem__(self, index):
        """Return the (features, label) pair at *index*."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

# Instantiate the dataset from a CSV on disk (path relative to the working dir).
dataset = DiabetesDataset('traindata.csv')

DataLoader

1
2
3
4
5
6
7
8
9
from torch.utils.data import DataLoader

# DataLoader arguments:
#   dataset:     a Dataset implementation
#   batch_size:  number of samples per mini-batch
#   shuffle:     whether to reshuffle the data every epoch
#   num_workers: number of worker subprocesses loading data in parallel
#                (processes, not threads)
# BUG FIX: the original passed batch_size=(int) -- the *type object* `int` --
# which is not a valid batch size; use a concrete integer instead.
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True, num_workers=2)

图片数据归一化处理

什么是图像归一化:

图像归一化是指对图像进行了一系列标准的处理变换,使之变换为一固定标准形式的过程,该标准图像称作归一化图像。原始图像在经历一些处理或攻击后可以得到多种副本图像,这些图像在经过相同参数的图像归一化处理后能够得到相同形式的标准图像。

自定义Dataset制造imbalanceMNIST数据集

尝试用CNN做一次不平衡数据集检测,但是之前使用的是pytorch自带的MNIST Dataset,想处理不同数据,于是参照MNIST的源码自己实现了一个Dataset,想减少多少数据都可以,用来做imbalance测试。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from PIL import Image

# unbalance
# Builds an artificially imbalanced MNIST-style dataset by dropping samples.
class UnBalancedDataset(Dataset):
    """MNIST-backed dataset that deletes samples per class to create imbalance.

    A non-zero ``filterlist[k]`` is the number of class-``k`` samples to keep;
    ``_change_data`` converts it into a drop count assuming ~6000 training
    samples per class (NOTE(review): MNIST classes are not exactly 6000 each --
    confirm this matches the intended counts). Zero means "keep all".
    """

    train_file = 'training.pt'
    test_file = 'test.pt'

    def __init__(self, root, train=True, transform=None, filterlist=None, **kwargs):
        self.root = root
        self.transform = transform
        self.train = train
        # BUG FIX: the original used a mutable default argument ([0]*10) and
        # mutated it in place, leaking state across instantiations and into
        # the caller's list. Copy defensively instead.
        self.filterlist = [0] * 10 if filterlist is None else list(filterlist)
        self._change_data()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.')

        data_file = self.train_file if self.train else self.test_file

        # Pre-processed MNIST tensors: (images, labels).
        self.data, self.targets = torch.load(
            os.path.join(self.root, 'MNIST', 'processed', data_file))

        self._filter()

    def __getitem__(self, index):
        # img is a (28, 28) uint8 tensor; convert to PIL so torchvision
        # transforms can be applied.
        img, target = self.data[index], self.targets[index]
        img = Image.fromarray(img.numpy())

        if self.transform is not None:
            img = self.transform(img)  # e.g. ToTensor -> (1, 28, 28)

        return img, target

    def __len__(self):
        return len(self.data)

    def _check_exists(self):
        # Both processed files must exist under <root>/MNIST/processed.
        processed = os.path.join(self.root, 'MNIST', 'processed')
        return (os.path.exists(os.path.join(processed, self.train_file))
                and os.path.exists(os.path.join(processed, self.test_file)))

    def _change_data(self):
        # Translate "samples to keep" into "samples to drop" for each class.
        # Assumes ~6000 training samples per class -- TODO confirm.
        for k, keep in enumerate(self.filterlist):
            if keep != 0:
                self.filterlist[k] = 6000 - keep

    def _filter(self):
        """Drop the first ``filterlist[c]`` samples of each class c.

        Replaces the original 10-way copy-pasted if-chain that rebuilt the
        whole tensor once per deleted sample; a single boolean keep-mask
        produces the identical final dataset in one pass.
        """
        print(pd.Series(self.filterlist, np.arange(10)))
        print('\tDeal with train_set...')
        keep = torch.ones(len(self.targets), dtype=torch.bool)
        for i, target in enumerate(self.targets):
            cls = int(target)
            if self.filterlist[cls] > 0:
                keep[i] = False
                self.filterlist[cls] -= 1
        self.data = self.data[keep]
        self.targets = self.targets[keep]
0%