
KeyError: 1 in a Python ID3 algorithm?

Asked on 2019-10-27 22:54:09
0 answers · 0 followers · 350 views

I am trying to implement the ID3 decision tree algorithm with support for continuous-valued features. The first pass prints the best split feature correctly, but the recursive calls raise KeyError: 1. Any help would be appreciated.

Code language: python
import math
import operator

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


def load_data():
    # Only include the first 8 descriptive features and the target label
    data = pd.read_csv("heart.csv", usecols=[
                       "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "target"])
    return data


def describe_partitions(ps):
    for target, p in sorted(ps.items(), key=lambda k: k[0]):
        print(f"{target}\t{p.shape[0]}")
    print("")

# Entropy of the target label distribution: the more evenly the classes are mixed, the higher it is
def entropy(data):
    counts = data["target"].value_counts()
    # print(counts)
    """
        Similar to doing the following manually:
            counts = {}
            for val in data["target"]:
                counts[val] = counts.get(val, 0) + 1
    """
    total = data["target"].shape[0]
    # print(data)
    sum = 0.
    for count in counts:
        p = count/total
        sum += p * math.log(p) 
    return - sum


def partitions(data, feature, thresholds):
    def find_threshold(feature, val):
        # Guaranteed to find a threshold somewhere between min and max
        for t in reversed(thresholds[feature]):
            if val >= t:
                return t
        raise Exception("Unexpected return without threshold")

    features = data.columns
    ps = {}
    for j, val in enumerate(data[feature]):
        # Treat categorical and continuous feature values differently
        if feature in thresholds:
            val = find_threshold(feature, val)
        p = ps.get(val, pd.DataFrame(columns=features))
        ps[val] = p.append(data.loc[j, features])
    return ps
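
# NOTE: a likely source of the KeyError above. enumerate() yields positions
# 0..n-1, while .loc[j, features] looks up index *labels*; train_test_split and
# the recursive filtering in ID3() keep the original labels, so a position such
# as 1 can be absent from the index and .loc raises KeyError: 1. See the sketch
# after the code.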


def create_thresholds(data, names, nstds=3):
    # Assume the data is normally-distributed
    thresholds = {}
    for feature in names:
        col = data[feature]
        mint, maxt = np.min(col), np.max(col)
        mean, stddev = np.mean(col), np.std(col)
        ts = [mint]
        for n in range(-nstds - 1, nstds):
            t = round(n * stddev + mean)
            if t >= mint and t <= maxt:
                ts.append(t)
        thresholds[feature] = ts
    return thresholds
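
# NOTE: range(-nstds - 1, nstds) spans -4..2 for nstds=3, which is asymmetric
# around the mean; range(-nstds, nstds + 1) would give the symmetric -3..3.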


def gain(data, H, feature, thresholds):
    ps = partitions(data, feature, thresholds)
    # describe_partitions(ps)
    sum = 0.
    for p in ps.values():
        if feature in p.columns:
            sum += (p.shape[0] / data.shape[0]) * entropy(p)
    return H - sum

# Return (value, 1) if the target attribute takes only one value in the data set, else (False, 0)
def unique(data):
    value_count=data['target'].value_counts()
    total=data['target'].shape[0]
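    # NOTE: value_count[0] below looks up the count for target *label* 0, not
    # the largest count; a partition containing only class 1 raises KeyError: 0
    # here. value_count.iloc[0] (value_counts sorts descending) may be intended.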
    if value_count[0]==total:
        return data['target'].values[0],1
    else:
        return False,0

# Return the most frequent value of the target attribute in the data
def common_value(data):
    c=data['target'].value_counts()
    sc=sorted(c.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sc[0][0]

def best_attribute(data,attr,thresholds):
    Gains=np.zeros(len(attr))
    h=entropy(data)
    for i,feature in enumerate(data.columns[attr]):
        Gains[i]=gain(data,h,feature,thresholds)
    best=data.columns[np.argmax(Gains)+1] # '+1' because the first column is assumed to be 'index', so the attribute features start at 1
    return best,np.argmax(Gains)+1
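
# NOTE: assuming the usual heart.csv column order, load_data() returns no
# 'index' column; the selected columns sit at positions 0..8 with 'target'
# last, so the '+1' above skips 'age' and can even pick 'target' itself.
# best = data.columns[attr[np.argmax(Gains)]] may be what was intended, with
# attr covering only the descriptive features.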

def ID3(data,attr,thresholds):
    tree={}
    record,truth=unique(data)

    # All examples share the same target value in this subset
    if truth!=0:
        tree = record
    # no more attributes to be considered
    elif len(attr)==0:
        # Return a single-node tree labelled with the most common target value in the data
        tree=common_value(data) 
    else:
        A,Anumber=best_attribute(data,attr,thresholds) 
        # print(A)
        tree={A:{}}
        values=data[A].values
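        # NOTE: attr.pop() below removes the *last* index from the shared list,
        # not the attribute just chosen (Anumber), and the mutation leaks into
        # sibling recursive calls; attr = [a for a in attr if a != Anumber]
        # would be one safer sketch. values also repeats duplicates, so
        # np.unique(values) gives one branch per distinct value.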
        attr.pop()
        for vi in values:
            #add subtree to tree
            tree[A][vi]=ID3(data[data[A] == vi].drop(A, axis=1),attr,thresholds)
        # print(examples)
    return tree
    
def main():
    data = load_data()
    # Split into training and test data sets
    train_data, test_data = train_test_split(data, test_size=0.25)
    # Compute the total entropy for the full data set with respect to the target label
    H = entropy(train_data)
    print(f"Total Entropy: {H}")
    # Generate threshold values for the continuous value descriptive features
    thresholds = create_thresholds(
        train_data, ["age", "chol", "trestbps", "thalach"], nstds=3)
    # Compute the level=0 information gain when partitioned on each descriptive feature
    IG = np.zeros(8)
    for i, feature in enumerate(data.columns[:8]):
        IG[i] = gain(train_data, H, feature, thresholds)
    
    # Print the best one (at the level=0)
    print(IG)
    A=data.columns[np.argmax(IG)]
    print("Best IG feature: "+A)
    l=list(range(1,9))
    # print(train_data)
    DecisionTree=ID3(train_data,l,thresholds)
    print(DecisionTree)


if __name__ == "__main__":
    main()
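
A likely cause, judging from the code above: partitions() iterates with enumerate(data[feature]), which yields positions 0..n-1, but then indexes with data.loc[j, features], which looks up index labels. train_test_split keeps the original row labels, and every recursive call data[data[A] == vi].drop(A, axis=1) filters them further, so a label such as 1 can be missing from the index and .loc raises KeyError: 1. A minimal sketch of the mismatch and two possible fixes (the small DataFrame is made up for illustration):

Code language: python
import pandas as pd

# A subset whose index still carries the parent's labels,
# as after train_test_split or data[data[A] == vi] in ID3()
sub = pd.DataFrame({"age": [40, 55], "target": [1, 1]}, index=[0, 2])

try:
    for j, val in enumerate(sub["target"]):  # j is a POSITION: 0, 1
        row = sub.loc[j]                     # .loc wants a LABEL: 0, 2
except KeyError as e:
    print("KeyError:", e)                    # prints: KeyError: 1

# Fix 1: make labels equal positions before iterating
fixed = sub.reset_index(drop=True)
for j, val in enumerate(fixed["target"]):
    row = fixed.loc[j]                       # labels are now 0..n-1

# Fix 2: keep the labels and index positionally instead
row = sub.iloc[1]

Either change applied inside partitions() (for example, data = data.reset_index(drop=True) as its first statement) keeps every recursive subset consistent with enumerate().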
