Chuẩn hóa Min-Max
Chuẩn hóa zScore
Chia giỏ theo chiều rộng
Chia giỏ theo chiều sâu
Xử lý dữ liệu bị thiếu
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# Command-line preprocessing helpers for CSV data: min-max and z-score
# normalisation, equal-width and equal-depth binning, and removal or filling
# of missing values.  Every transformation mutates the DataFrame it is given
# and also returns it.
import argparse
import bisect
import math

import pandas as pd

# Textual placeholders treated as missing, in addition to real NaN/None.
MISSING_MARKERS = ("nan", "NaN", "", "?")


def status(message):
    """Print a progress message."""
    print(message)


def convert_number(data):
    """Coerce *data* to numeric values; unparsable entries become NaN."""
    if isinstance(data, pd.Series):
        return pd.to_numeric(data, errors='coerce')
    return data.apply(pd.to_numeric, errors='coerce')


def max_data(data):
    """Return the maximum of *data*."""
    return data.max()


def min_data(data):
    """Return the minimum of *data*."""
    return data.min()


def mean_data(data):
    """Return the mean of *data*."""
    return data.mean()


def std_data(data):
    """Return the standard deviation of *data* (pandas default, ddof=1)."""
    return data.std()


def mode_data(data, prop):
    """Return the first mode of column *prop* of the frame *data*."""
    return data.mode()[prop][0]


def sort_data(prop, data):
    """Sort *data* in place by column *prop* and return it."""
    data.sort_values(by=[prop], inplace=True)
    return data


def set_index_data(prop, data):
    """Use column *prop* as the index of *data*, drop the column, return *data*."""
    data.index = data[prop]
    data.drop(prop, axis=1, inplace=True)
    return data


def _missing_mask(column):
    """Return a boolean Series marking NaN values and textual placeholders."""
    mask = column.isna()
    # Placeholders are strings, so they can only occur in non-numeric columns.
    if not pd.api.types.is_numeric_dtype(column):
        mask = mask | column.isin(list(MISSING_MARKERS))
    return mask


def min_max(prop, data):
    """Rescale column *prop* of *data* to the range [0, 1].

    The column is coerced to numbers first.  A constant column maps to 0.0
    instead of the NaN that a 0/0 division would produce.  The column is
    replaced as a whole, so an integer column can receive the float result.
    """
    status(f"Min-max normalizing {{{prop}}} ...")
    values = convert_number(data[prop]).astype(float)
    low = min_data(values)
    span = max_data(values) - low
    if span == 0:
        data[prop] = values - low
    else:
        data[prop] = (values - low) / span
    return data


def z_score(prop, data):
    """Standardise column *prop* of *data* to zero mean and unit deviation.

    The column is coerced to numbers first.  A column with zero deviation
    maps to 0.0 instead of NaN.
    """
    status(f"Z-score normalizing {{{prop}}} ...")
    values = convert_number(data[prop]).astype(float)
    centered = values - mean_data(values)
    spread = std_data(values)
    data[prop] = centered if spread == 0 else centered / spread
    return data


def equal_depth(prop, data, bins):
    """Replace column *prop* with equal-frequency bin labels "B1".."B<bins>".

    Values are ranked in ascending order (ties keep their original order) and
    each consecutive run of ceil(count / bins) values shares a label, where
    count is the number of non-missing values.  Missing values stay missing,
    and row order, index and the other columns are left untouched.  Nothing
    happens when *bins* is not a positive number.
    """
    status(f"Equal depth binning {{{prop}}} {bins} bins...")
    if not bins or bins <= 0:
        return data
    values = data[prop].reset_index(drop=True)
    # Positions of the non-missing values, in ascending order of value.
    ranked = values.dropna().sort_values(kind="mergesort").index
    if len(ranked) == 0:
        return data
    per_bin = math.ceil(len(ranked) / bins)
    labels = values.astype(object).tolist()
    for rank, position in enumerate(ranked):
        labels[position] = "B" + str(rank // per_bin + 1)
    data[prop] = labels
    return data


def equal_width(prop, data, bins):
    """Replace column *prop* with equal-width interval labels.

    The bin width is (max - min) / bins rounded to one decimal.  The first bin
    is "(-inf-e1]", the last is "(e-inf)", and the ones in between are
    "(a-b]".  A single bin is labelled "(-inf-inf)".  The column is coerced to
    numbers first; missing or unparsable values stay missing.  Nothing happens
    when *bins* is not a positive number.
    """
    status(f"Equal width binning {{{prop}}} {bins} bins...")
    if not bins or bins <= 0:
        return data
    values = convert_number(data[prop]).astype(float)
    if not values.notna().any():
        data[prop] = values
        return data
    low = float(min_data(values))
    high = float(max_data(values))
    width = round((high - low) / bins, 1)
    # Interior cut points only; the outer bins are open-ended.
    edges = [round(low + width * (i + 1), 1) for i in range(bins - 1)]
    if not edges:
        names = ["(-inf-inf)"]
    else:
        names = ["(-inf-" + str(edges[0]) + "]"]
        names += ["(" + str(a) + "-" + str(b) + "]"
                  for a, b in zip(edges, edges[1:])]
        names.append("(" + str(edges[-1]) + "-inf)")
    # bisect_left puts a value equal to a cut point into the bin it closes.
    data[prop] = values.map(
        lambda value: names[bisect.bisect_left(edges, value)],
        na_action="ignore",
    )
    return data


def remove_missing(prop, data):
    """Drop, in place, every row whose *prop* value is missing; return *data*."""
    status(f"Removing missing {{{prop}}} ...")
    missing = _missing_mask(data[prop])
    count = int(missing.sum())
    if count:
        data.drop(index=data.index[missing.to_numpy()], inplace=True)
    print("Remove", count, "instance missing values")
    return data


def fill_missing(prop, data):
    """Fill missing *prop* values in place; return *data*.

    Numeric columns are filled with the mean and other columns with the most
    frequent value, both computed over the non-missing values only so that a
    placeholder such as "?" can never be chosen as the fill value.  Nothing is
    filled when the column has no usable value.
    """
    status(f"Filling missing {{{prop}}} ...")
    column = data[prop]
    missing = _missing_mask(column)
    present = column[~missing]
    if pd.api.types.is_numeric_dtype(column):
        fill_value = mean_data(present)
    else:
        modes = present.mode()
        fill_value = modes.iloc[0] if not modes.empty else None
    count = int(missing.sum())
    if count and fill_value is not None and not pd.isna(fill_value):
        data.loc[missing.to_numpy(), prop] = fill_value
    else:
        count = 0
    print("Fill", count, "missing values")
    return data


def read_file(path):
    """Read the CSV file at *path*; return a DataFrame, or None on failure."""
    status("Reading file ...")
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        print("File not found!")
    except (OSError, ValueError) as err:
        # pandas parser and decoding errors are ValueError subclasses.
        print("Cannot read file:", err)
    return None


def write_file(path, data):
    """Write *data* to *path* as CSV without the index column."""
    status("Writing file ...")
    try:
        data.to_csv(path, index=False)
    except OSError as err:
        print("Cannot write a file!", err)


def action(task, props, data, option):
    """Apply *task* to every column of *data* named in *props*; return *data*.

    *option* is the number of bins and is only used by the binning tasks.
    Columns that do not exist are reported and skipped; an unknown task
    leaves the data unchanged.
    """
    handlers = {
        'minMax': lambda prop, frame: min_max(prop, frame),
        'zScore': lambda prop, frame: z_score(prop, frame),
        'equalWidth': lambda prop, frame: equal_width(prop, frame, option),
        'equalDepth': lambda prop, frame: equal_depth(prop, frame, option),
        'removeMissing': lambda prop, frame: remove_missing(prop, frame),
        'fillMissing': lambda prop, frame: fill_missing(prop, frame),
    }
    handler = handlers.get(task)
    for prop in props:
        if prop not in data.columns:
            status(f"Skipping {{{prop}}}: no such column")
            continue
        if handler is not None:
            data = handler(prop, data)
    return data


def main(args):
    """Run the task selected on the command line from input file to output file."""
    data = read_file(args.input)
    if data is None:
        return
    # Tolerate spaces around the commas and a missing --prop.
    props = [name.strip() for name in (args.prop or "").split(",") if name.strip()]
    data = action(args.task, props, data, args.bin)
    write_file(args.output, data)
    status("Finish!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Preprocess data')
    parser.add_argument('-i', '--input', required=True, help='input file path')
    parser.add_argument('-o', '--output', required=True, help='output file path')
    parser.add_argument(
        '-t', '--task', required=True,
        choices=['minMax', 'zScore', 'equalWidth', 'equalDepth',
                 'removeMissing', 'fillMissing'],
        help='process something')
    parser.add_argument('-p', '--prop', required=True,
                        help='property list. ex: age,weight')
    parser.add_argument('-b', '--bin', type=int, help='number of bins')
    args = parser.parse_args()
    if args.task in ('equalWidth', 'equalDepth') and (args.bin is None or args.bin < 1):
        parser.error('--bin must be a positive integer for binning tasks')
    main(args)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# Command-line cleaner for a "key=value" country listing: parses the records,
# drops empty and duplicated instances, converts areas given in "mi" to "km"
# and removes rows without an area, then writes the result as CSV.
import argparse
import math

import pandas as pd

# Descriptive fields of a record; "country" itself starts a new record.
DETAIL_COLUMNS = ['name', 'longName', 'foundingDate', 'population',
                  'capital', 'largestCity', 'area']
ALL_COLUMNS = ['country'] + DETAIL_COLUMNS

# Textual placeholders treated as missing, in addition to real NaN/None.
MISSING_MARKERS = ("nan", "NaN", "", "?")

# Conversion factor applied to "mi" areas (square miles per square kilometre).
SQ_MILES_PER_SQ_KM = 0.38610


def status(message):
    """Print a progress message."""
    print(message)


def _is_missing(value):
    """Return True for NaN/None and for the textual missing placeholders."""
    return bool(pd.isna(value)) or value in MISSING_MARKERS


def remove_empty(data):
    """Drop, in place, rows in which every descriptive column is empty."""
    status("Removing empty instance ...")
    data.dropna(subset=DETAIL_COLUMNS, how='all', inplace=True)
    return data


def remove_duplicate(data):
    """Return a copy of *data* without rows that repeat an earlier value.

    Columns are processed one after another: within each descriptive column a
    row is dropped when its value already occurred in an earlier remaining
    row; rows whose value is missing are always kept.
    """
    # NOTE(review): this removes a row when ANY single column repeats (for
    # example two countries sharing a founding date) - confirm that this is
    # intended rather than whole-record de-duplication.
    status("Removing duplicate ...")
    for column in DETAIL_COLUMNS:
        keep = ~data[column].duplicated() | data[column].isna()
        data = data[keep]
    # Detach from the source frame so later in-place edits are unambiguous.
    return data.copy()


def _area_to_km(raw):
    """Return *raw* converted to a "<n>km" string, or None if it cannot be parsed."""
    text = str(raw).replace('or ', '').replace(',', '')
    try:
        return str(round(float(text.replace('mi', '')) / SQ_MILES_PER_SQ_KM)) + "km"
    except (ValueError, OverflowError):
        return None


def convert_area(data):
    """Rewrite "area" values that are given in "mi" as rounded "km" strings.

    Missing values and values that do not contain "mi" are left unchanged.
    A value that contains "mi" but is not a plain number is reported and left
    unchanged instead of aborting the whole run.
    """
    status("Converting area ...")
    converted = []
    for raw in data['area']:
        result = raw
        if not _is_missing(raw) and "mi" in str(raw):
            parsed = _area_to_km(raw)
            if parsed is None:
                status("Cannot convert area: " + str(raw))
            else:
                result = parsed
        converted.append(result)
    data['area'] = converted
    return data


def remove_missing(prop, data):
    """Drop, in place, every row whose *prop* value is missing; return *data*."""
    status("Removing missing {" + prop + "} ...")
    column = data[prop]
    missing = column.isna()
    # Placeholders are strings, so they can only occur in non-numeric columns.
    if not pd.api.types.is_numeric_dtype(column):
        missing = missing | column.isin(list(MISSING_MARKERS))
    count = int(missing.sum())
    if count:
        data.drop(index=data.index[missing.to_numpy()], inplace=True)
    print("Remove", count, "instance missing values")
    return data


def read_file(path):
    """Parse the "key=value" listing at *path* into one row per country.

    The first eight lines of the file are skipped.  Each "country" key starts
    a new record and the keys that follow fill that record's columns.  Returns
    a DataFrame, or None when the file cannot be read.
    """
    status("Reading file ...")
    try:
        reader = pd.read_csv(path, sep="=", skiprows=8, names=["column", "value"])
    except FileNotFoundError:
        print("File not found!")
        return None
    except (OSError, ValueError) as err:
        # pandas parser and decoding errors are ValueError subclasses.
        print("Cannot read file:", err)
        return None

    records = {}
    current = 0  # keys seen before the first "country" line go to record 0
    for key, value in zip(reader["column"], reader["value"]):
        if pd.isna(key):
            continue
        if key == "country":
            current += 1
        records.setdefault(current, {})[key] = value

    data = pd.DataFrame.from_dict(records, orient="index")
    # Keep the known columns first; unknown keys stay as extra columns.
    extra = [name for name in data.columns if name not in ALL_COLUMNS]
    return data.reindex(columns=ALL_COLUMNS + extra)


def write_file(path, data):
    """Write *data* to *path* as CSV without the index column."""
    status("Writing file ...")
    try:
        data.to_csv(path, index=False)
    except OSError as err:
        print("Cannot write a file!", err)


def main(args):
    """Clean the input listing and write the result to the output file."""
    data = read_file(args.input)
    if data is None:
        return
    data = remove_empty(data)
    data = remove_duplicate(data)
    data = convert_area(data)
    data = remove_missing("area", data)
    write_file(args.output, data)
    status("Finish!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Preprocess data')
    parser.add_argument('-i', '--input', required=True, help='input file path')
    parser.add_argument('-o', '--output', required=True, help='output file path')
    args = parser.parse_args()
    main(args)
Leave a Reply