
KeyError: 1 in a Python ID3 algorithm?

Asked on 2019-10-27 22:54:09
0 answers · 0 followers · 350 views

I am trying to implement the ID3 decision tree algorithm with support for continuous-valued features. The first pass prints the best split feature correctly, but the recursive calls raise KeyError: 1. Any help would be appreciated.

Code language: python
import math
import operator

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


def load_data():
    # Only include the first 8 descriptive features and the target label
    data = pd.read_csv("heart.csv", usecols=[
                       "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "target"])
    return data


def describe_partitions(ps):
    for target, p in sorted(ps.items(), key=lambda k: k[0]):
        print(f"{target}\t{p.shape[0]}")
    print("")

# Entropy of the target label distribution: the more evenly the classes are mixed, the higher it is
def entropy(data):
    counts = data["target"].value_counts()
    # print(counts)
    """
        Similar to doing the following manually:
            counts = {}
            for val in data["target"]:
                counts[val] = counts.get(val, 0) + 1
    """
    total = data["target"].shape[0]
    # print(data)
    sum = 0.
    for count in counts:
        p = count/total
        sum += p * math.log(p) 
    return - sum


def partitions(data, feature, thresholds):
    def find_threshold(feature, val):
        # Guaranteed to find a threshold somewhere between min and max
        for t in reversed(thresholds[feature]):
            if val >= t:
                return t
        raise Exception("Unexpected return without threshold")

    features = data.columns
    ps = {}
    for j, val in enumerate(data[feature]):
        # Treat categorical and continuous feature values differently
        if feature in thresholds:
            val = find_threshold(feature, val)
        p = ps.get(val, pd.DataFrame(columns=features))
        ps[val] = p.append(data.loc[j, features])
    return ps
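
# NOTE: a likely source of the KeyError above. enumerate() yields positions
# 0..n-1, while .loc[j, features] looks up index *labels*; train_test_split and
# the recursive filtering in ID3() keep the original labels, so a position such
# as 1 can be absent from the index and .loc raises KeyError: 1. See the sketch
# after the code.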


def create_thresholds(data, names, nstds=3):
    # Assume the data is normally-distributed
    thresholds = {}
    for feature in names:
        col = data[feature]
        mint, maxt = np.min(col), np.max(col)
        mean, stddev = np.mean(col), np.std(col)
        ts = [mint]
        for n in range(-nstds - 1, nstds):
            t = round(n * stddev + mean)
            if t >= mint and t <= maxt:
                ts.append(t)
        thresholds[feature] = ts
    return thresholds
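
# NOTE: range(-nstds - 1, nstds) spans -4..2 for nstds=3, which is asymmetric
# around the mean; range(-nstds, nstds + 1) would give the symmetric -3..3.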


def gain(data, H, feature, thresholds):
    ps = partitions(data, feature, thresholds)
    # describe_partitions(ps)
    sum = 0.
    for p in ps.values():
        if feature in p.columns:
            sum += (p.shape[0] / data.shape[0]) * entropy(p)
    return H - sum

# Return (value, 1) if the target attribute takes only one value in the data set, else (False, 0)
def unique(data):
    value_count=data['target'].value_counts()
    total=data['target'].shape[0]
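    # NOTE: value_count[0] below looks up the count for target *label* 0, not
    # the largest count; a partition containing only class 1 raises KeyError: 0
    # here. value_count.iloc[0] (value_counts sorts descending) may be intended.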
    if value_count[0]==total:
        return data['target'].values[0],1
    else:
        return False,0

# Return the most frequent value of the target attribute in the data
def common_value(data):
    c=data['target'].value_counts()
    sc=sorted(c.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sc[0][0]

def best_attribute(data,attr,thresholds):
    Gains=np.zeros(len(attr))
    h=entropy(data)
    for i,feature in enumerate(data.columns[attr]):
        Gains[i]=gain(data,h,feature,thresholds)
    best=data.columns[np.argmax(Gains)+1] # '+1' because the first column is assumed to be 'index', so the attribute features start at 1
    return best,np.argmax(Gains)+1
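
# NOTE: assuming the usual heart.csv column order, load_data() returns no
# 'index' column; the selected columns sit at positions 0..8 with 'target'
# last, so the '+1' above skips 'age' and can even pick 'target' itself.
# best = data.columns[attr[np.argmax(Gains)]] may be what was intended, with
# attr covering only the descriptive features.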

def ID3(data,attr,thresholds):
    tree={}
    record,truth=unique(data)

    # All examples share the same target value in this subset
    if truth!=0:
        tree = record
    # no more attributes to be considered
    elif len(attr)==0:
        # Return a single-node tree labelled with the most common target value in the data
        tree=common_value(data) 
    else:
        A,Anumber=best_attribute(data,attr,thresholds) 
        # print(A)
        tree={A:{}}
        values=data[A].values
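        # NOTE: attr.pop() below removes the *last* index from the shared list,
        # not the attribute just chosen (Anumber), and the mutation leaks into
        # sibling recursive calls; attr = [a for a in attr if a != Anumber]
        # would be one safer sketch. values also repeats duplicates, so
        # np.unique(values) gives one branch per distinct value.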
        attr.pop()
        for vi in values:
            #add subtree to tree
            tree[A][vi]=ID3(data[data[A] == vi].drop(A, axis=1),attr,thresholds)
        # print(examples)
    return tree
    
def main():
    data = load_data()
    # Split into training and test data sets
    train_data, test_data = train_test_split(data, test_size=0.25)
    # Compute the total entropy for the full data set with respect to the target label
    H = entropy(train_data)
    print(f"Total Entropy: {H}")
    # Generate threshold values for the continuous value descriptive features
    thresholds = create_thresholds(
        train_data, ["age", "chol", "trestbps", "thalach"], nstds=3)
    # Compute the level=0 information gain when partitioned on each descriptive feature
    IG = np.zeros(8)
    for i, feature in enumerate(data.columns[:8]):
        IG[i] = gain(train_data, H, feature, thresholds)
    
    # Print the best one (at the level=0)
    print(IG)
    A=data.columns[np.argmax(IG)]
    print("Best IG feature: "+A)
    l=list(range(1,9))
    # print(train_data)
    DecisionTree=ID3(train_data,l,thresholds)
    print(DecisionTree)


if __name__ == "__main__":
    main()
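
A likely cause, judging from the code above: partitions() iterates with enumerate(data[feature]), which yields positions 0..n-1, but then indexes with data.loc[j, features], which looks up index labels. train_test_split keeps the original row labels, and every recursive call data[data[A] == vi].drop(A, axis=1) filters them further, so a label such as 1 can be missing from the index and .loc raises KeyError: 1. A minimal sketch of the mismatch and two possible fixes (the small DataFrame is made up for illustration):

Code language: python
import pandas as pd

# A subset whose index still carries the parent's labels,
# as after train_test_split or data[data[A] == vi] in ID3()
sub = pd.DataFrame({"age": [40, 55], "target": [1, 1]}, index=[0, 2])

try:
    for j, val in enumerate(sub["target"]):  # j is a POSITION: 0, 1
        row = sub.loc[j]                     # .loc wants a LABEL: 0, 2
except KeyError as e:
    print("KeyError:", e)                    # prints: KeyError: 1

# Fix 1: make labels equal positions before iterating
fixed = sub.reset_index(drop=True)
for j, val in enumerate(fixed["target"]):
    row = fixed.loc[j]                       # labels are now 0..n-1

# Fix 2: keep the labels and index positionally instead
row = sub.iloc[1]

Either change applied inside partitions() (for example, data = data.reset_index(drop=True) as its first statement) keeps every recursive subset consistent with enumerate().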
