from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import tree

# Age-rating labels, listed in ascending order.
class_names_ordered = ["0", "6", "12", "16", "18"]

# Every column from position 10 onward is a model input;
# the "age_rating" column is the prediction target.
columns = df.columns[10:]
features, ratings = df[columns], df["age_rating"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
(
    features_train,
    features_test,
    ratings_train,
    ratings_test,
) = train_test_split(features, ratings, test_size=0.2, random_state=42)

# Preview the first 15 feature names, then report feature and split sizes.
column_names = columns.tolist()
print("[", ", ".join(column_names[:15]), "...", "]")
print("\nNumbers of different features:", len(column_names))
print("Training set size:", len(features_train))
print("Test set size:", len(features_test))

[ Düstere Atmosphäre, Angedeutete Gewalt, Schimpfwörter, Comic-Gewalt, Drastische Gewalt, Sexualisierte Gewalt, Drogen, Glücksspielthematik, Belastende Themen, Handlungsdruck, Schreckmomente, Horror, Alkohol, Druck zum Vielspielen, Problematische Werbeinhalte ... ]

Numbers of different features: 29
Training set size: 34418
Test set size: 8605

# Training initial decision tree
# A fixed seed makes the fit reproducible: scikit-learn permutes the features
# at every split, so ties between equally good splits can otherwise resolve
# differently from run to run and change the resulting tree size.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(features_train, ratings_train)

DecisionTreeClassifier()

DecisionTreeClassifier()

# Predict on both partitions so training and test performance can be compared.
ratings_train_pred, ratings_test_pred = (
    clf.predict(split) for split in (features_train, features_test)
)
show_accuracy_plots(
    ratings_train,
    ratings_train_pred,
    ratings_test,
    ratings_test_pred,
)

# Drop rows whose rating is not "0" although none of their descriptor columns is set.
total_descriptors = df[columns].sum(axis=1)
rated_without_descriptors = (total_descriptors == 0) & (df["age_rating"] != "0")
df = df[~rated_without_descriptors]

print("Filtered dataset size:", len(df))

# Rebuild inputs/target and the train/test split from the filtered rows.
features, ratings = df[columns], df["age_rating"]
(
    features_train,
    features_test,
    ratings_train,
    ratings_test,
) = train_test_split(features, ratings, test_size=0.2, random_state=42)

Filtered dataset size: 20249

# Draw the fitted tree. plot_tree looks class names up by position in
# clf.classes_, which is sorted (lexicographically for string labels, e.g.
# "0", "12", "16", "18", "6"). A hand-ordered list can therefore attach the
# wrong rating to a node, so the names are derived from the classifier itself.
tree.plot_tree(clf, class_names=[str(label) for label in clf.classes_], filled=True)
plt.show()

# Report the size of the fitted tree.
print("Numbers of leaves:", clf.get_n_leaves())
print("Numbers of nodes:", clf.tree_.node_count)

Numbers of leaves: 134
Numbers of nodes: 267

# Estimator used to derive the candidate pruning strengths below.
clf = tree.DecisionTreeClassifier(max_leaf_nodes=25, max_depth=10)

# Cost-complexity pruning: take the alphas along the pruning path and keep
# only those inside the range of interest (the filter is applied once; the
# original applied the identical filter twice).
# NOTE(review): the path is computed on a tree capped by max_leaf_nodes and
# max_depth, while the candidate trees below are grown without those caps.
# Confirm this mismatch is intended.
path = clf.cost_complexity_pruning_path(features_train, ratings_train)
ccp_alphas = [alpha for alpha in path.ccp_alphas if 0.0003 <= alpha <= 0.03]

# Fit one tree per remaining alpha. `clf` is rebound on every iteration, so
# after the loop it refers to the tree fitted with the last alpha.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(features_train, ratings_train)
    clfs.append(clf)

# Size statistics of each fitted tree, in the same order as ccp_alphas.
node_counts = [fitted.tree_.node_count for fitted in clfs]
depth = [fitted.tree_.max_depth for fitted in clfs]

# Fit a tree pruned with ccp_alpha=0.004 and evaluate it on both partitions.
clf = tree.DecisionTreeClassifier(
    ccp_alpha=0.004,  # <- pruning strength applied to this tree
    random_state=0,
).fit(features_train, ratings_train)

ratings_train_pred, ratings_test_pred = (
    clf.predict(split) for split in (features_train, features_test)
)
show_accuracy_plots(
    ratings_train,
    ratings_train_pred,
    ratings_test,
    ratings_test_pred,
)

# Report the size of the pruned tree on a single line.
leaf_total, node_total = clf.get_n_leaves(), clf.tree_.node_count
print("Numbers of leaves:", leaf_total, "Numbers of nodes:", node_total)

Numbers of leaves: 9 Numbers of nodes: 17

# Row-wise sum over every column from position 10 onward, stored as a new
# column inserted at position 10.
# NOTE(review): presumably those columns are the 0/1 descriptor flags, so the
# sum is the number of descriptors per title. Confirm against the data load.
trailing_columns = df.iloc[:, 10:]
number_of_descriptors = trailing_columns.sum(axis=1)
df.insert(loc=10, column="number_of_descriptors", value=number_of_descriptors)

# Refit the pruned tree (ccp_alpha=0.004) on the current train split and
# evaluate it on both partitions.
clf = tree.DecisionTreeClassifier(ccp_alpha=0.004, random_state=0)
clf.fit(features_train, ratings_train)

ratings_train_pred, ratings_test_pred = (
    clf.predict(split) for split in (features_train, features_test)
)
show_accuracy_plots(
    ratings_train,
    ratings_train_pred,
    ratings_test,
    ratings_test_pred,
)

# Report the tree size; the node count goes on its own output line.
leaf_total, node_total = clf.get_n_leaves(), clf.tree_.node_count
print("Numbers of leaves:", leaf_total, "\nNumbers of nodes:", node_total)

Numbers of leaves: 5 
Numbers of nodes: 9

# Same pruned tree, now with class weights balanced by scikit-learn.
clf = tree.DecisionTreeClassifier(
    class_weight="balanced",  # <- the setting added in this step
    ccp_alpha=0.004,
    random_state=0,
).fit(features_train, ratings_train)

ratings_train_pred, ratings_test_pred = (
    clf.predict(split) for split in (features_train, features_test)
)
show_accuracy_plots(
    ratings_train,
    ratings_train_pred,
    ratings_test,
    ratings_test_pred,
)

# Report the tree size; the node count goes on its own output line.
leaf_total, node_total = clf.get_n_leaves(), clf.tree_.node_count
print("Numbers of leaves:", leaf_total, "\nNumbers of nodes:", node_total)

Numbers of leaves: 12 
Numbers of nodes: 23

# Draw the tree on a wide figure so the node labels stay readable.
fig, ax = plt.subplots(figsize=(22, 10))

# plot_tree looks class names up by position in clf.classes_, which is sorted
# (lexicographically for string labels). Deriving the names from the
# classifier keeps every node labelled with the rating it actually predicts;
# a hand-ordered list can mislabel them.
tree.plot_tree(
    clf,
    filled=True,
    class_names=[str(label) for label in clf.classes_],
    feature_names=columns.tolist(),
    label="all",
    ax=ax,  # draw on the figure created above rather than the implicit current axes
)
plt.show()

# Re-run the evaluation helper with its fifth positional argument set to True.
# NOTE(review): presumably this switches the helper to the weighted error
# report. Confirm against the definition of show_accuracy_plots.
show_accuracy_plots(ratings_train, ratings_train_pred, ratings_test, ratings_test_pred, True)

Total weighted error: 0.9319326959113436
Max weighted error: 19.999999999999996
Total weighted error: 1.091355960564243
Max weighted error: 20.0

# Feature columns: all columns except the 2nd- and 7th-from-last, with the
# last 17 entries of the resulting (sorted) index dropped.
excluded_columns = [df.columns[-2], df.columns[-7]]
columns = df.columns.difference(excluded_columns)[:-17]

# The "esrb_rating" column is the prediction target.
ratings = df["esrb_rating"]
class_names_ordered = ["E", "E10+", "T", "M", "AO"]
features = df[columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
(
    features_train,
    features_test,
    ratings_train,
    ratings_test,
) = train_test_split(features, ratings, test_size=0.2, random_state=42)

# Preview the first 15 feature names, then report feature and split sizes.
column_names = columns.tolist()
print("[", ", ".join(column_names[:15]), "...", "]")
print("\nNumbers of different features:", len(column_names))
print("Training set size:", len(features_train))
print("Test set size:", len(features_test))

[ Alcohol Reference, Alcohol and Tobacco Reference, Animated Blood, Animated Blood and Gore, Animated Violence, Blood, Blood and Gore, Cartoon Violence, Comic Mischief, Crude Humor, Drug Reference, Drug and Alcohol Reference, Edutainment, Fantasy Violence, Gambling ... ]

Numbers of different features: 56
Training set size: 36712
Test set size: 9179

# Training initial decision tree
# A fixed seed makes the fit reproducible: scikit-learn permutes the features
# at every split, so ties between equally good splits can otherwise resolve
# differently from run to run and change the resulting tree size.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(features_train, ratings_train)

DecisionTreeClassifier()

DecisionTreeClassifier()

# Predict on both partitions so training and test performance can be compared.
ratings_train_pred, ratings_test_pred = (
    clf.predict(split) for split in (features_train, features_test)
)
show_accuracy_plots(
    ratings_train,
    ratings_train_pred,
    ratings_test,
    ratings_test_pred,
)

# Report the size of the fitted tree.
print("Numbers of leaves:", clf.get_n_leaves())
print("Numbers of nodes:", clf.tree_.node_count)

# plot_tree looks class names up by position in clf.classes_, which is sorted
# (for these string labels: "AO", "E", "E10+", "M", "T"). A hand-ordered list
# such as E < E10+ < T < M < AO would mislabel the nodes, so the names are
# derived from the classifier itself.
tree.plot_tree(clf, class_names=[str(label) for label in clf.classes_], filled=True)
plt.show()

Numbers of leaves: 959
Numbers of nodes: 1917

# Limit the number of nodes by fitting with cost-complexity pruning
# (ccp_alpha=0.006) and a fixed seed.

clf = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=0.006)
clf.fit(features_train, ratings_train)

DecisionTreeClassifier(ccp_alpha=0.006, random_state=0)

DecisionTreeClassifier(ccp_alpha=0.006, random_state=0)

Numbers of leaves: 11 
Numbers of nodes: 21

# Pruned tree (ccp_alpha=0.006), now with class weights balanced by scikit-learn.
clf = tree.DecisionTreeClassifier(
    class_weight="balanced",  # <- the setting added in this step
    ccp_alpha=0.006,
    random_state=0,
).fit(features_train, ratings_train)

# Evaluate on both partitions.
ratings_train_pred, ratings_test_pred = (
    clf.predict(split) for split in (features_train, features_test)
)
show_accuracy_plots(
    ratings_train,
    ratings_train_pred,
    ratings_test,
    ratings_test_pred,
)

{'0': 0, '6': 1, '12': 2, '16': 3, '18': 4}

# Re-run the evaluation helper with its fifth positional argument set to True.
# NOTE(review): presumably this switches the helper to the weighted error
# report. Confirm against the definition of show_accuracy_plots.
show_accuracy_plots(ratings_train, ratings_train_pred, ratings_test, ratings_test_pred, True)

Data Literacy SS25¶

Analyse, Visualisierung und Vorhersage der Alterskennzeichen von Videospielen¶

Ying-Cheng Yang¶

Inhaltsverzeichnis¶

🇩🇪 Unterhaltungssoftware Selbstkontrolle (USK)¶

Datenerhebung¶

Fragestellung¶

Entscheidungsbaum¶

Preprocessing¶

Das sieht gar nicht gut aus!¶

Was ist hier schief gelaufen?¶

Preprocessing Teil 2¶

Das sieht viel besser aus!!¶

Ein großer Baum¶

Die Anzahl der Blätter und Knoten begrenzen¶

Cost Complexity Pruning und Alpha-Wert¶

Overfitting¶

Class Weights¶

Wir haben die Anzahl der Blätter und Knoten im Baum um fast 10x reduziert, trotzdem verlieren wir im Test nur 1.1% der Genauigkeit!!¶

Bessere Berechnung der Genauigkeit¶

Distanzmatrix¶

🇺🇸 + 🇨🇦 Entertainment Software Rating Board (ESRB)¶

Datensatz¶

Entscheidungsbaum (diesmal mit ESRB)¶

Pruning the tree¶

1911 Knoten -> 21 Knoten, 91x Reduktion!¶

Class Weights¶

Gewichtete Konfusionsmatrix und Genauigkeit¶

Fußnoten¶

Was kann man noch mit diesen Daten machen?¶

Bessere Klassifikation in Verbindung mit Spielgenres¶

Features zusammengruppieren¶

	title_name	age_rating	Comic-Gewalt	Drastische Gewalt	Sexualisierte Gewalt	...	Gewalt	In-Game-Käufe
0	Lonely Mountains: Snow Riders	0	0	0	0	...	0	1
1	Wobbly Life	6	1	0	0	...	0	0
2	Nicktoons & The Dice of Destiny	12	1	0	0	...	0	0
3	United Assault – Final Stand	16	0	0	0	...	1	0
4	High On Life	18	0	1	1	...	0	0