In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, jaccard_score, classification_report
)

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers, initializers


# Фиксируем сиды
tf.keras.utils.set_random_seed(42)
np.random.seed(42)


2025-11-18 14:07:56.280405: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763464076.290369  156871 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763464076.293437  156871 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763464076.302274  156871 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763464076.302284  156871 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763464076.302285  156871 computation_placer.cc:177] computation placer alr

In [2]:
# ------------------------------------------------------------
# 1) Загрузка
# ------------------------------------------------------------
df = pd.read_csv("drug_consumption_transformed.csv")


In [3]:
# ------------------------------------------------------------
# 2) Парсинг мультиметки "Used in Last Week"
# ------------------------------------------------------------
def _norm_token(t: str) -> str:
    t = str(t).strip()
    t = re.sub(r'^[\[\]\(\)\{\}"\'‘’“”\s]+|[\[\]\(\)\{\}"\'‘’“”\s]+$', "", t)
    return t

def parse_labels(cell):
    if pd.isna(cell):
        return []
    if isinstance(cell, (list, tuple, set, np.ndarray)):
        toks = [_norm_token(x) for x in cell]
    else:
        raw = str(cell).strip().strip("[]")
        toks = [_norm_token(x) for x in re.split(r"[;,]", raw)]
    return [t for t in toks if t]


In [4]:
assert "Used in Last Week" in df.columns, "Нет колонки 'Used in Last Week'!"
y_labels = df["Used in Last Week"].apply(parse_labels)

mlb = MultiLabelBinarizer()
Y_full = mlb.fit_transform(y_labels).astype("float32")
classes_all = mlb.classes_
print("Все метки (вещества):", list(classes_all))


Все метки (вещества): ['Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'LSD', 'Legalh', 'Meth', 'Mushrooms', 'Nicotine', 'VSA']


In [5]:
# ------------------------------------------------------------
# 3) Признаки
# ------------------------------------------------------------
feat_cols = [
    "Age", "Gender", "Education", "Country", "Ethnicity",
    "Nscore", "Escore", "Oscore", "Ascore", "Cscore",
    "Impulsive", "SS"
]
X_full = df[feat_cols].astype("float32").copy()


In [6]:
# ------------------------------------------------------------
# 4) Train/Test сплит с "стратификацией" по числу меток
# ------------------------------------------------------------
card = Y_full.sum(axis=1)
bins = np.clip(card, 0, 6)

X_trn, X_tst, Y_trn_all, Y_tst_all = train_test_split(
    X_full, Y_full,
    test_size=0.2,
    random_state=42,
    stratify=bins
)


In [7]:
# ------------------------------------------------------------
# 5) Удаляем совсем редкие метки
# ------------------------------------------------------------
pos = Y_trn_all.sum(axis=0)
neg = Y_trn_all.shape[0] - pos
keep = (pos >= 10) & (neg >= 10)

labels_kept = classes_all[keep]
labels_dropped = classes_all[~keep]
print("Оставляем метки:", list(labels_kept))
if len(labels_dropped):
    print("Выкидываем слишком редкие:", list(labels_dropped))

Y_trn = Y_trn_all[:, keep]
Y_tst = Y_tst_all[:, keep]


Оставляем метки: ['Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Ecstasy', 'Heroin', 'Ketamine', 'LSD', 'Legalh', 'Meth', 'Mushrooms', 'Nicotine', 'VSA']
Выкидываем слишком редкие: ['Crack']


In [8]:
# ------------------------------------------------------------
# 6) Препроцессинг признаков + внутренний вал-сплит
# ------------------------------------------------------------
imp = SimpleImputer(strategy="median")
scaler = StandardScaler()

X_trn_imp = imp.fit_transform(X_trn)
X_tst_imp = imp.transform(X_tst)

X_trn_sc = scaler.fit_transform(X_trn_imp)
X_tst_sc = scaler.transform(X_tst_imp)

card_tr = Y_trn.sum(axis=1)
bins_tr = np.clip(card_tr, 0, 6)

X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_trn_sc, Y_trn,          # <-- здесь было Y_tr, заменили на Y_trn
    test_size=0.2,
    random_state=42,
    stratify=bins_tr
)


In [9]:
# ------------------------------------------------------------
# 7) pos_weight + bias init для выхода
# ------------------------------------------------------------
eps = 1e-3
pos_lab = Y_tr.sum(axis=0)
neg_lab = Y_tr.shape[0] - pos_lab

pos_frac = np.clip(
    pos_lab / np.maximum(pos_lab + neg_lab, 1.0),
    eps,
    1.0 - eps
)
out_bias = np.log(pos_frac / (1.0 - pos_frac)).astype("float32")
pos_weight = (neg_lab + 1.0) / (pos_lab + 1.0)
pos_weight_tf = tf.constant(pos_weight.astype("float32"))


I0000 00:00:1763464078.682624  156871 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7220 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:06:00.0, compute capability: 8.6


In [10]:
def weighted_bce(pos_w):
    def loss(y_true, y_pred):
        e = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, e, 1.0 - e)
        term_pos = -pos_w * y_true * tf.math.log(y_pred)
        term_neg = -(1.0 - y_true) * tf.math.log(1.0 - y_pred)
        return tf.reduce_mean(term_pos + term_neg)
    return loss


In [11]:
# ------------------------------------------------------------
# 8) Модель
# ------------------------------------------------------------
n_in = X_tr.shape[1]
n_out = Y_tr.shape[1]

inp = layers.Input(shape=(n_in,))
x = layers.Dense(256, activation="relu", kernel_initializer="he_normal")(inp)
x = layers.Dropout(0.3)(x)
x = layers.Dense(128, activation="relu", kernel_initializer="he_normal")(x)
x = layers.Dropout(0.3)(x)
out = layers.Dense(
    n_out,
    activation="sigmoid",
    bias_initializer=initializers.Constant(out_bias)
)(x)


In [12]:
model = models.Model(inp, out, name="MLP_last_week_posw")

model.compile(
    optimizer=optimizers.Adam(1e-3),
    loss=weighted_bce(pos_weight_tf)
)


In [13]:
early = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True
)
reduce = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=5,
    min_lr=1e-5
)


In [14]:
history = model.fit(
    X_tr, Y_tr,
    validation_data=(X_val, Y_val),
    epochs=80,
    batch_size=128,
    callbacks=[early, reduce],
    verbose=1
)


Epoch 1/80


I0000 00:00:1763464115.266642  157357 service.cc:152] XLA service 0x7ff5700078f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763464115.266656  157357 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2025-11-18 14:08:35.282084: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1763464115.364216  157357 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 1/10[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 1s/step - loss: 2.1101

I0000 00:00:1763464115.918707  157357 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - loss: 2.8062








[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 355ms/step - loss: 2.8001 - val_loss: 2.4261 - learning_rate: 0.0010
Epoch 2/80
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.8899 - val_loss: 1.8246 - learning_rate: 0.0010
Epoch 3/80
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.5076 - val_loss: 1.5334 - learning_rate: 0.0010
Epoch 4/80
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.4503 - val_loss: 1.3740 - learning_rate: 0.0010
Epoch 5/80
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.4095 - val_loss: 1.2771 - learning_rate: 0.0010
Epoch 6/80
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.3080 - val_loss: 1.2093 - learning_rate: 0.0010
Epoch 7/80
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.2756 - val_loss: 1.2114 - learning_rate: 0.0010
Epoch 8/80
[1m

In [15]:
# ------------------------------------------------------------
# 9) Прогнозы вероятностей
# ------------------------------------------------------------
P_tr = model.predict(X_tr, batch_size=256, verbose=0)
P_val = model.predict(X_val, batch_size=256, verbose=0)
P_te = model.predict(X_tst_sc, batch_size=256, verbose=0)
















In [16]:
# ------------------------------------------------------------
# 10) Подбор порогов по F1 на валидации
# ------------------------------------------------------------
from sklearn.metrics import fbeta_score

def best_label_thresholds(Y_true, P, beta=1.0):
    grid = np.linspace(0.05, 0.8, 32)
    T = []
    for j in range(P.shape[1]):
        yj, pj = Y_true[:, j], P[:, j]
        if yj.sum() == 0:
            T.append(0.5); continue
        best_t, best_s = 0.5, -1.0
        for t in grid:
            pred = (pj >= t).astype(int)
            s = fbeta_score(yj, pred, beta=beta, average="binary", zero_division=0)
            if s > best_s:
                best_t, best_s = t, s
        T.append(float(best_t))
    return np.array(T, dtype="float32")


In [17]:
def best_global_threshold_fbeta(Y_true, P, beta=1.0):
    grid = np.linspace(0.05, 0.8, 32)
    scored = [
        (t, fbeta_score(Y_true, (P >= t).astype(int), beta=beta,
                        average="micro", zero_division=0))
        for t in grid
    ]
    return max(scored, key=lambda x: x[1])


In [18]:
thr_vec = best_label_thresholds(Y_val, P_val, beta=1.0)
t_glob, s_glob = best_global_threshold_fbeta(Y_val, P_val, beta=1.0)

thr_min = 0.2
thr_vec = np.maximum(thr_vec, thr_min)
print("Глобальный порог (micro-F1):", t_glob, "score:", s_glob)
print("Пер-меточные пороги:", dict(zip(labels_kept, np.round(thr_vec, 3))))


Глобальный порог (micro-F1): 0.34032258064516124 score: 0.26531408505657433
Пер-меточные пороги: {'Alcohol': 0.292, 'Amphet': 0.776, 'Amyl': 0.268, 'Benzos': 0.582, 'Caff': 0.461, 'Cannabis': 0.51, 'Choc': 0.2, 'Coke': 0.8, 'Ecstasy': 0.679, 'Heroin': 0.534, 'Ketamine': 0.8, 'LSD': 0.703, 'Legalh': 0.558, 'Meth': 0.703, 'Mushrooms': 0.8, 'Nicotine': 0.461, 'VSA': 0.34}


In [20]:
def decode_with_fallback(P, thr_vec, t_glob):
    pred = (P >= thr_vec).astype(int)
    row_max = P.max(axis=1)
    row_arg = P.argmax(axis=1)
    empties = np.where(pred.sum(axis=1) == 0)[0]
    use = empties[row_max[empties] >= max(t_glob, 0.25)]
    if use.size:
        pred[use, row_arg[use]] = 1
    return pred

Yp_tr = decode_with_fallback(P_tr, thr_vec, t_glob)
Yp_te = decode_with_fallback(P_te, thr_vec, t_glob)



In [21]:
# ------------------------------------------------------------
# 11) Оценка
# ------------------------------------------------------------
def eval_multilabel(name, Y_true, Y_pred, labels):
    subset_acc = accuracy_score(Y_true, Y_pred)
    micro_p = precision_score(Y_true, Y_pred, average="micro", zero_division=0)
    micro_r = recall_score(Y_true, Y_pred, average="micro", zero_division=0)
    micro_f1 = f1_score(Y_true, Y_pred, average="micro", zero_division=0)
    macro_p = precision_score(Y_true, Y_pred, average="macro", zero_division=0)
    macro_r = recall_score(Y_true, Y_pred, average="macro", zero_division=0)
    macro_f1 = f1_score(Y_true, Y_pred, average="macro", zero_division=0)
    jacc = jaccard_score(Y_true, Y_pred, average="samples", zero_division=0)

    print(f"\n=== {name} — Общие метрики ===")
    print(f"Subset Accuracy: {subset_acc:.4f}")
    print(f"Micro  P/R/F1 : {micro_p:.4f} / {micro_r:.4f} / {micro_f1:.4f}")
    print(f"Macro  P/R/F1 : {macro_p:.4f} / {macro_r:.4f} / {macro_f1:.4f}")
    print(f"Jaccard (samples): {jacc:.4f}")

    print(f"\n=== {name} — Отчёт по меткам ===")
    print(classification_report(
        Y_true, Y_pred,
        target_names=labels,
        digits=4,
        zero_division=0
    ))


In [22]:
eval_multilabel("TRAIN — last_week_MLP", Y_tr, Yp_tr, labels_kept)



=== TRAIN — last_week_MLP — Общие метрики ===
Subset Accuracy: 0.0531
Micro  P/R/F1 : 0.2345 / 0.7622 / 0.3587
Macro  P/R/F1 : 0.1526 / 0.6023 / 0.2303
Jaccard (samples): 0.2708

=== TRAIN — last_week_MLP — Отчёт по меткам ===
              precision    recall  f1-score   support

     Alcohol     0.4002    0.9490    0.5630       471
      Amphet     0.1167    0.2979    0.1677        47
        Amyl     0.0306    1.0000    0.0594         9
      Benzos     0.1391    0.5818    0.2246        55
        Caff     0.2167    0.4770    0.2980       174
    Cannabis     0.1924    0.5641    0.2870       117
        Choc     0.3616    0.9977    0.5308       436
        Coke     0.1263    0.4000    0.1920        30
     Ecstasy     0.0854    0.3889    0.1400        36
      Heroin     0.0503    0.6923    0.0938        13
    Ketamine     0.1250    0.3750    0.1875        24
         LSD     0.1613    0.7143    0.2632        35
      Legalh     0.1165    0.5714    0.1935        42
        Meth   

In [23]:
eval_multilabel("TEST  — last_week_MLP", Y_tst, Yp_te, labels_kept)



=== TEST  — last_week_MLP — Общие метрики ===
Subset Accuracy: 0.0477
Micro  P/R/F1 : 0.2242 / 0.7423 / 0.3443
Macro  P/R/F1 : 0.1155 / 0.4877 / 0.1737
Jaccard (samples): 0.2677

=== TEST  — last_week_MLP — Отчёт по меткам ===
              precision    recall  f1-score   support

     Alcohol     0.4206    0.9679    0.5864       156
      Amphet     0.0500    0.2222    0.0816         9
        Amyl     0.0106    0.5000    0.0208         2
      Benzos     0.1077    0.4118    0.1707        17
        Caff     0.1509    0.3019    0.2013        53
    Cannabis     0.1897    0.6286    0.2914        35
        Choc     0.3750    1.0000    0.5455       141
        Coke     0.0741    0.2857    0.1176         7
     Ecstasy     0.1273    0.3684    0.1892        19
      Heroin     0.0333    1.0000    0.0645         2
    Ketamine     0.0000    0.0000    0.0000         6
         LSD     0.0698    0.2143    0.1053        14
      Legalh     0.0857    0.4286    0.1429        14
        Meth   

### Результаты по лучшей моделе:

#### Опять общая точность модели плохая, но по относительно высокому recall (по сравнению с другими моделями, которые я делал) Мы чаще всего определяем хотя бы одно всещество правильно и по отношению precisino/recall мы видим, что модель больше склонна к ложному положительному результату чем отрицательному.