In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, jaccard_score, classification_report, fbeta_score
)

np.random.seed(42)

import warnings
warnings.filterwarnings("ignore")


In [2]:
# ------------------------------------------------------------
# 1) Загрузка данных
# ------------------------------------------------------------
df = pd.read_csv("drug_consumption_transformed.csv")


In [3]:
# 2) Парсинг мультиметки "Used in Last Year"
# ------------------------------------------------------------
def _norm_token(t: str) -> str:
    t = str(t).strip()
    t = re.sub(r'^[\[\]\(\)\{\}"\'‘’“”\s]+|[\[\]\(\)\{\}"\'‘’“”\s]+$', "", t)
    return t

def parse_labels(cell):
    if pd.isna(cell):
        return []
    if isinstance(cell, (list, tuple, set, np.ndarray)):
        toks = [_norm_token(x) for x in cell]
    else:
        raw = str(cell).strip().strip("[]")
        toks = [_norm_token(x) for x in re.split(r"[;,]", raw)]
    return [t for t in toks if t]


In [4]:
assert "Used in Last Year" in df.columns, "Нет колонки 'Used in Last Year'!"
y_labels = df["Used in Last Year"].apply(parse_labels)

mlb = MultiLabelBinarizer()
Y_full = mlb.fit_transform(y_labels).astype("float32")
classes_all = mlb.classes_
print("Все метки (вещества):", list(classes_all))


Все метки (вещества): ['Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'LSD', 'Legalh', 'Meth', 'Mushrooms', 'Nicotine', 'Semer', 'VSA']


In [5]:
# ------------------------------------------------------------
# 3) Признаки
# ------------------------------------------------------------
feat_cols = [
    "Age", "Gender", "Education", "Country", "Ethnicity",
    "Nscore", "Escore", "Oscore", "Ascore", "Cscore",
    "Impulsive", "SS"
]
X_full = df[feat_cols].astype("float32").copy()


In [6]:
# ------------------------------------------------------------
# 4) Train/Test сплит c стратификацией по числу меток
# ------------------------------------------------------------
card = Y_full.sum(axis=1)
bins = np.clip(card, 0, 8)

X_trn, X_tst, Y_trn_all, Y_tst_all = train_test_split(
    X_full, Y_full,
    test_size=0.2,
    random_state=42,
    stratify=bins
)


In [7]:
# ------------------------------------------------------------
# 5) Отбор слишком редких меток
# ------------------------------------------------------------
pos = Y_trn_all.sum(axis=0)
neg = Y_trn_all.shape[0] - pos

# минимум 20 положительных и 20 отрицательных
keep = (pos >= 20) & (neg >= 20)

labels_kept    = classes_all[keep]
labels_dropped = classes_all[~keep]

print("Оставляем метки:", list(labels_kept))
if len(labels_dropped):
    print("Выкидываем слишком редкие:", list(labels_dropped))

Y_trn = Y_trn_all[:, keep]
Y_tst = Y_tst_all[:, keep]


Оставляем метки: ['Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'LSD', 'Legalh', 'Meth', 'Mushrooms', 'Nicotine', 'VSA']
Выкидываем слишком редкие: ['Semer']


In [8]:
# ------------------------------------------------------------
# 6) Препроцессинг + внутренний вал-сплит
# ------------------------------------------------------------
imp = SimpleImputer(strategy="median")
scaler = StandardScaler()

X_trn_imp = imp.fit_transform(X_trn)
X_tst_imp = imp.transform(X_tst)

X_trn_sc = scaler.fit_transform(X_trn_imp)
X_tst_sc = scaler.transform(X_tst_imp)

card_tr = Y_trn.sum(axis=1)
bins_tr = np.clip(card_tr, 0, 8)

X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_trn_sc, Y_trn,   # <-- тут была ошибка, должно быть Y_trn
    test_size=0.2,
    random_state=42,
    stratify=bins_tr
)


In [9]:
# ------------------------------------------------------------
# 7) Логистическая регрессия One-vs-Rest
# ------------------------------------------------------------
base_clf = LogisticRegression(
    max_iter=500,
    solver="saga",
    penalty="l2",
    class_weight="balanced"
)

clf = OneVsRestClassifier(base_clf, n_jobs=-1)
clf.fit(X_tr, Y_tr)




In [10]:
# ------------------------------------------------------------
# 8) Прогнозы вероятностей
# ------------------------------------------------------------
P_tr = clf.predict_proba(X_tr)
P_val = clf.predict_proba(X_val)
P_te = clf.predict_proba(X_tst_sc)

P_tr = np.asarray(P_tr)
P_val = np.asarray(P_val)
P_te = np.asarray(P_te)


In [11]:
# ------------------------------------------------------------
# 9) Подбор порогов по F1 на валидации
# ------------------------------------------------------------
def best_label_thresholds(Y_true, P, beta=1.0):
    grid = np.linspace(0.05, 0.9, 36)
    T = []
    for j in range(P.shape[1]):
        yj, pj = Y_true[:, j], P[:, j]
        if yj.sum() == 0:
            T.append(0.5); continue
        best_t, best_s = 0.5, -1.0
        for t in grid:
            pred = (pj >= t).astype(int)
            s = fbeta_score(yj, pred, beta=beta, average="binary", zero_division=0)
            if s > best_s:
                best_t, best_s = t, s
        T.append(float(best_t))
    return np.array(T, dtype="float32")

def best_global_threshold_fbeta(Y_true, P, beta=1.0):
    grid = np.linspace(0.05, 0.9, 36)
    scored = [
        (t, fbeta_score(Y_true, (P >= t).astype(int), beta=beta,
                        average="micro", zero_division=0))
        for t in grid
    ]
    return max(scored, key=lambda x: x[1])


In [12]:
thr_vec = best_label_thresholds(Y_val, P_val, beta=1.0)
t_glob, s_glob = best_global_threshold_fbeta(Y_val, P_val, beta=1.0)

thr_min = 0.2
thr_vec = np.maximum(thr_vec, thr_min)

print("Глобальный порог (micro-F1):", t_glob, "score:", s_glob)
print("Пер-меточные пороги:", dict(zip(labels_kept, np.round(thr_vec, 3))))


Глобальный порог (micro-F1): 0.6328571428571429 score: 0.2895174708818636
Пер-меточные пороги: {'Alcohol': 0.56, 'Amphet': 0.73, 'Amyl': 0.511, 'Benzos': 0.487, 'Caff': 0.851, 'Cannabis': 0.56, 'Choc': 0.681, 'Coke': 0.706, 'Crack': 0.9, 'Ecstasy': 0.56, 'Heroin': 0.536, 'Ketamine': 0.584, 'LSD': 0.706, 'Legalh': 0.56, 'Meth': 0.609, 'Mushrooms': 0.487, 'Nicotine': 0.511, 'VSA': 0.706}


In [13]:
def decode_with_fallback(P, thr_vec, t_glob):
    pred = (P >= thr_vec).astype(int)
    row_max = P.max(axis=1)
    row_arg = P.argmax(axis=1)

    empties = np.where(pred.sum(axis=1) == 0)[0]
    use = empties[row_max[empties] >= max(t_glob, 0.3)]
    if use.size:
        pred[use, row_arg[use]] = 1
    return pred

Yp_tr = decode_with_fallback(P_tr, thr_vec, t_glob)
Yp_te = decode_with_fallback(P_te, thr_vec, t_glob)


In [14]:
# ------------------------------------------------------------
# 10) Оценка качества
# ------------------------------------------------------------
def eval_multilabel(name, Y_true, Y_pred, labels):
    subset_acc = accuracy_score(Y_true, Y_pred)
    micro_p = precision_score(Y_true, Y_pred, average="micro", zero_division=0)
    micro_r = recall_score(Y_true, Y_pred, average="micro", zero_division=0)
    micro_f1 = f1_score(Y_true, Y_pred, average="micro", zero_division=0)
    macro_p = precision_score(Y_true, Y_pred, average="macro", zero_division=0)
    macro_r = recall_score(Y_true, Y_pred, average="macro", zero_division=0)
    macro_f1 = f1_score(Y_true, Y_pred, average="macro", zero_division=0)
    jacc = jaccard_score(Y_true, Y_pred, average="samples", zero_division=0)

    print(f"\n=== {name} — Общие метрики ===")
    print(f"Subset Accuracy: {subset_acc:.4f}")
    print(f"Micro  P/R/F1 : {micro_p:.4f} / {micro_r:.4f} / {micro_f1:.4f}")
    print(f"Macro  P/R/F1 : {macro_p:.4f} / {macro_r:.4f} / {macro_f1:.4f}")
    print(f"Jaccard (samples): {jacc:.4f}")

    print(f"\n=== {name} — Отчёт по меткам ===")
    print(classification_report(
        Y_true, Y_pred,
        target_names=labels,
        digits=4,
        zero_division=0
    ))


In [15]:
eval_multilabel("TRAIN — last_year_LogReg", Y_tr, Yp_tr, labels_kept)
eval_multilabel("TEST  — last_year_LogReg", Y_tst, Yp_te, labels_kept)



=== TRAIN — last_year_LogReg — Общие метрики ===
Subset Accuracy: 0.1816
Micro  P/R/F1 : 0.2150 / 0.5726 / 0.3127
Macro  P/R/F1 : 0.1926 / 0.5314 / 0.2705
Jaccard (samples): 0.1131

=== TRAIN — last_year_LogReg — Отчёт по меткам ===
              precision    recall  f1-score   support

     Alcohol     0.1760    0.4800    0.2575       125
      Amphet     0.2759    0.4308    0.3363       130
        Amyl     0.1330    0.8000    0.2281        65
      Benzos     0.2242    0.6886    0.3382       167
        Caff     0.0597    0.0909    0.0721        44
    Cannabis     0.1797    0.4627    0.2589       134
        Choc     0.0536    0.1818    0.0828        33
        Coke     0.3046    0.3155    0.3099       168
       Crack     0.0000    0.0000    0.0000        36
     Ecstasy     0.3137    0.6573    0.4247       178
      Heroin     0.0879    0.7838    0.1580        37
    Ketamine     0.1468    0.6000    0.2359        80
         LSD     0.3111    0.5882    0.4070       119
      Leg

#### Модель нормально предсказывает частые вещества, но любит давать ложное положительное при редких. При попытке повысить precision, все метрики упали значительно.