# File listing metadata (extraction residue): 33 lines, 1.0 KiB, Python
from util import *
"""Balance the positive samples across cell sizes.

Samples are grouped into equal-width bins by the area of their target
rectangle, and every bin is capped at ``NB_EACH_BIN`` rows so that no single
cell size is over-represented in the output file.
"""

# Maximum number of samples kept from each area bin.
NB_EACH_BIN = 1000
# Number of equal-width area bins.
NB_BINS = 20

filename = './data/rect_data.csv'


def balance_by_area(data, nb_each_bin=NB_EACH_BIN, nb_bins=NB_BINS):
    """Return a randomly sub-sampled copy of *data*, balanced by area.

    The first four columns of each row are the target; the area of a row is
    ``(col1 - col0) * (col3 - col2)``.  Rows are split into ``nb_bins``
    equal-width area bins (the same edges ``np.histogram`` produces) and at
    most ``nb_each_bin`` randomly chosen rows are kept from each bin.

    Args:
        data: 2-D array with one sample per row and at least four columns.
        nb_each_bin: Maximum number of rows kept per bin.
        nb_bins: Number of equal-width area bins.

    Returns:
        A new 2-D array with the same dtype and column count as *data*.
        Rows are grouped by ascending bin and randomly ordered within each
        bin.  The input array is not modified.
    """
    data = np.asarray(data)
    if data.shape[0] == 0:
        # Nothing to balance; avoids stacking an empty sequence.
        return data.copy()

    # Shuffle once (on a copy) so that "the first N rows of a bin" below is a
    # uniform random sample of that bin.
    shuffled = np.random.permutation(data)

    # Compute areas in 64-bit so the product cannot overflow int32.
    target = shuffled[:, 0:4].astype(np.int64)
    area = (target[:, 1] - target[:, 0]) * (target[:, 3] - target[:, 2])

    edges = np.histogram(area, bins=nb_bins)[1]
    # Count the interior edges that are <= area.  This gives the half-open
    # bin [edges[i], edges[i+1]) for every bin except the last one, which is
    # closed on the right -- so samples with the maximum area are kept
    # instead of being silently dropped.
    bin_index = np.searchsorted(edges[1:-1], area, side='right')

    kept = []
    for idx_bin in range(nb_bins):
        rows = np.flatnonzero(bin_index == idx_bin)
        kept.append(shuffled[rows[:nb_each_bin]])
    return np.vstack(kept)


def main():
    """Load the raw samples, balance them by area and write the result."""
    # ndmin=2 keeps a single-row file two-dimensional.
    data = np.loadtxt(filename, dtype=np.int32, comments='#', delimiter=' ', ndmin=2)
    balanced = balance_by_area(data)
    np.savetxt('./data/rect_data2.csv', balanced, fmt='%d', header = 'rect_examples, first 4 cols are target')


if __name__ == '__main__':
    main()