In [1]:
import time
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, jaccard_score, precision_score, recall_score, accuracy_score,
    classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier


In [4]:
# ------------------------------------------------------------
# 1) Load data
# ------------------------------------------------------------
import os

# Directory that holds the dataset. The default is the original author's
# location; set the DRUG_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = os.environ.get("DRUG_DATA_DIR", '/home/konnilol/Documents/uni/mmo/pr5')

# The working directory is still switched, so any relative path used after
# this cell keeps resolving exactly as before.
os.chdir(DATA_DIR)

df = pd.read_csv("drug_consumption_transformed.csv")


In [5]:
# ------------------------------------------------------------
# 2) Parse the multi-label column "Used over a Decade Ago"
# ------------------------------------------------------------

import re

def _norm_token(t: str) -> str:
    t = str(t).strip()
    t = re.sub(r'^[\[\]\(\)\{\}"\'‘’“”\s]+|[\[\]\(\)\{\}"\'‘’“”\s]+$', "", t)
    return t

def parse_labels(cell):
    """Split one raw cell into a list of cleaned label names.

    A missing cell (as detected by ``pd.isna``) yields an empty list.
    Otherwise the cell is stringified, outer square brackets are dropped,
    the text is split on ``;`` or ``,`` and every piece is cleaned with
    ``_norm_token``; pieces that end up empty are discarded.
    """
    if pd.isna(cell):
        return []

    body = str(cell).strip().strip("[]")
    found = []
    for piece in re.split(r"[;,]", body):
        token = _norm_token(piece)
        if token:
            found.append(token)
    return found

col = "Used over a Decade Ago"
assert col in df.columns, f"Нет колонки '{col}'"

# One list of label names per row (an empty list where the cell is missing).
y_labels = df[col].apply(parse_labels)

# Binary indicator matrix with one column per distinct label.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_labels)
Y = Y.astype("float32")

# Column order of Y follows mlb.classes_.
labels = mlb.classes_
print("Метки:", list(labels))


Метки: ['Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'LSD', 'Legalh', 'Meth', 'Mushrooms', 'Nicotine', 'Semer', 'VSA']


In [6]:
# ------------------------------------------------------------
# 3) Features
# ------------------------------------------------------------

# Columns of df used as model inputs.
feat_cols = [
    "Age",
    "Gender",
    "Education",
    "Country",
    "Ethnicity",
    "Nscore",
    "Escore",
    "Oscore",
    "Ascore",
    "Cscore",
    "Impulsive",
    "SS",
]

# Select the feature columns and cast them all to float32.
X = df.loc[:, feat_cols].astype("float32")


In [7]:
# ------------------------------------------------------------
# 4) Train/Test split
# ------------------------------------------------------------

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# ------------------------------------------------------------
# 5) Preprocessing
# ------------------------------------------------------------

imp = SimpleImputer(strategy="median")
sc = StandardScaler()

# Fit the imputer and the scaler on the training part only ...
X_train = imp.fit_transform(X_train)
X_train = sc.fit_transform(X_train)

# ... and reuse the fitted statistics on the test part.
X_test = imp.transform(X_test)
X_test = sc.transform(X_test)


In [9]:
# ------------------------------------------------------------
# 6) Evaluation helper
# ------------------------------------------------------------

def eval_model(name, model, X_test, Y_test):
    """Predict on the test set, print a short report and return the metrics.

    Parameters
    ----------
    name : str
        Label used in the printed header and stored under ``"model"``.
    model : fitted estimator
        Any object exposing ``predict(X_test)``.
    X_test : array-like
        Features passed to ``model.predict``.
    Y_test : array-like
        True labels that the predictions are compared against.

    Returns
    -------
    dict
        Keys ``"model"``, ``"micro_f1"``, ``"macro_f1"``, ``"jaccard"`` and
        ``"time"`` (duration of the ``predict`` call in seconds).
    """
    # perf_counter is the high-resolution monotonic clock intended for timing
    # short calls; time.time() is a wall clock that may be coarse or adjusted.
    start = time.perf_counter()
    Y_pred = model.predict(X_test)
    elapsed = time.perf_counter() - start

    micro = f1_score(Y_test, Y_pred, average="micro", zero_division=0)
    macro = f1_score(Y_test, Y_pred, average="macro", zero_division=0)
    jacc = jaccard_score(Y_test, Y_pred, average="samples", zero_division=0)

    print(f"\n=== {name} ===")
    print(f"Точность (Micro-F1): {micro:.4f}")
    print(f"Точность (Macro-F1): {macro:.4f}")
    print(f"Jaccard(samples):   {jacc:.4f}")
    print(f"Время предсказания: {elapsed:.4f} сек")

    return {
        "model": name,
        "micro_f1": micro,
        "macro_f1": macro,
        "jaccard": jacc,
        "time": elapsed
    }


# Collected per-model metric dicts; each model cell below appends one entry.
results = []


In [10]:
# ------------------------------------------------------------
# 7) METHOD 1 — Logistic Regression (OvR)
# ------------------------------------------------------------

# One balanced logistic regression per label.
clf_lr = OneVsRestClassifier(
    estimator=LogisticRegression(class_weight="balanced", max_iter=600),
)
clf_lr.fit(X_train, Y_train)

results.append(
    eval_model(
        name="LogisticRegression (OvR)",
        model=clf_lr,
        X_test=X_test,
        Y_test=Y_test,
    )
)



=== LogisticRegression (OvR) ===
Точность (Micro-F1): 0.2364
Точность (Macro-F1): 0.1896
Jaccard(samples):   0.0650
Время предсказания: 0.0039 сек


In [11]:
# ------------------------------------------------------------
# 8) METHOD 2 — Linear SVM (OvR)
# ------------------------------------------------------------

# One balanced linear SVM per label. random_state is pinned (same seed as the
# train/test split) because the dual solver shuffles the data, which would
# otherwise make the reported metrics vary between runs.
clf_svm = OneVsRestClassifier(
    LinearSVC(class_weight="balanced", random_state=42)
)
clf_svm.fit(X_train, Y_train)
results.append(eval_model("Linear SVM (OvR)", clf_svm, X_test, Y_test))



=== Linear SVM (OvR) ===
Точность (Micro-F1): 0.2361
Точность (Macro-F1): 0.1894
Jaccard(samples):   0.0659
Время предсказания: 0.0024 сек


In [12]:
# ------------------------------------------------------------
# 9) METHOD 3 — RandomForest (MultiOutput)
# ------------------------------------------------------------

# One forest per label. Bootstrapping and feature sub-sampling are random, so
# the seed is pinned (same value as the train/test split) to make the
# reported metrics reproducible on a fresh run.
clf_rf = MultiOutputClassifier(
    RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=42
    )
)
clf_rf.fit(X_train, Y_train)
results.append(eval_model("RandomForest (MultiOutput)", clf_rf, X_test, Y_test))



=== RandomForest (MultiOutput) ===
Точность (Micro-F1): 0.0737
Точность (Macro-F1): 0.0399
Jaccard(samples):   0.0107
Время предсказания: 0.8722 сек


In [13]:
# ------------------------------------------------------------
# 10) METHOD 4 — MLP (neural network)
# ------------------------------------------------------------

# Weight initialization and mini-batch shuffling are random, so the seed is
# pinned (same value as the train/test split) to make the reported metrics
# reproducible on a fresh run.
clf_mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    max_iter=300,
    learning_rate_init=0.001,
    random_state=42
)
clf_mlp.fit(X_train, Y_train)
results.append(eval_model("MLPClassifier", clf_mlp, X_test, Y_test))




=== MLPClassifier ===
Точность (Micro-F1): 0.1779
Точность (Macro-F1): 0.1146
Jaccard(samples):   0.0275
Время предсказания: 0.0005 сек


In [14]:
# ------------------------------------------------------------
# 11) METHOD 5 — kNN (MultiOutput)
# ------------------------------------------------------------

# One 7-nearest-neighbours classifier per label.
clf_knn = MultiOutputClassifier(
    estimator=KNeighborsClassifier(n_neighbors=7),
)
clf_knn.fit(X_train, Y_train)

results.append(
    eval_model(
        name="kNN",
        model=clf_knn,
        X_test=X_test,
        Y_test=Y_test,
    )
)



=== kNN ===
Точность (Micro-F1): 0.0970
Точность (Macro-F1): 0.0460
Jaccard(samples):   0.0145
Время предсказания: 0.3085 сек


In [15]:
# ------------------------------------------------------------
# 12) Summary table
# ------------------------------------------------------------
print("\n\n=== СВОДНАЯ ТАБЛИЦА ===")

# One row per evaluated model, best micro-F1 first.
df_res = pd.DataFrame(results)
print(
    df_res.sort_values(by="micro_f1", ascending=False)
)




=== СВОДНАЯ ТАБЛИЦА ===
                        model  micro_f1  macro_f1   jaccard      time
0    LogisticRegression (OvR)  0.236388  0.189626  0.065041  0.003909
1            Linear SVM (OvR)  0.236068  0.189358  0.065917  0.002414
3               MLPClassifier  0.177924  0.114591  0.027516  0.000480
4                         kNN  0.097046  0.045981  0.014492  0.308500
2  RandomForest (MultiOutput)  0.073733  0.039861  0.010684  0.872202


### Качество моделей хуже, чем в других задачах, но это может быть связано с исходными данными. Из полученных моделей лучшая — Logistic Regression (OvR) (по Micro-F1 и Macro-F1); по времени предсказания она на третьем месте после MLPClassifier и Linear SVM.