{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "af3e8ee5",
   "metadata": {},
   "source": [
    "# Diabetes Classification + PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e20e1efb",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
    "\n",
    "import statsmodels.api as sm\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4e491137",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Размер: (768, 9)\n",
      "- Number of times pregnant\n",
      "- Plasma glucose concentration a 2 hours in an oral glucose tolerance test\n",
      "- Diastolic blood pressure (mm Hg)\n",
      "- Triceps skin fold thickness (mm)\n",
      "- 2-Hour serum insulin (mu U/ml)\n",
      "- Body mass index (weight in kg/(height in m)^2)\n",
      "- Diabetes pedigree function\n",
      "- Age (years)\n",
      "- Class variable (0 or 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Number of times pregnant</th>\n",
       "      <th>Plasma glucose concentration a 2 hours in an oral glucose tolerance test</th>\n",
       "      <th>Diastolic blood pressure (mm Hg)</th>\n",
       "      <th>Triceps skin fold thickness (mm)</th>\n",
       "      <th>2-Hour serum insulin (mu U/ml)</th>\n",
       "      <th>Body mass index (weight in kg/(height in m)^2)</th>\n",
       "      <th>Diabetes pedigree function</th>\n",
       "      <th>Age (years)</th>\n",
       "      <th>Class variable (0 or 1)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Number of times pregnant  \\\n",
       "0                         6   \n",
       "1                         1   \n",
       "2                         8   \n",
       "3                         1   \n",
       "4                         0   \n",
       "\n",
       "   Plasma glucose concentration a 2 hours in an oral glucose tolerance test  \\\n",
       "0                                                148                          \n",
       "1                                                 85                          \n",
       "2                                                183                          \n",
       "3                                                 89                          \n",
       "4                                                137                          \n",
       "\n",
       "   Diastolic blood pressure (mm Hg)  Triceps skin fold thickness (mm)  \\\n",
       "0                                72                                35   \n",
       "1                                66                                29   \n",
       "2                                64                                 0   \n",
       "3                                66                                23   \n",
       "4                                40                                35   \n",
       "\n",
       "   2-Hour serum insulin (mu U/ml)  \\\n",
       "0                               0   \n",
       "1                               0   \n",
       "2                               0   \n",
       "3                              94   \n",
       "4                             168   \n",
       "\n",
       "   Body mass index (weight in kg/(height in m)^2)  Diabetes pedigree function  \\\n",
       "0                                            33.6                       0.627   \n",
       "1                                            26.6                       0.351   \n",
       "2                                            23.3                       0.672   \n",
       "3                                            28.1                       0.167   \n",
       "4                                            43.1                       2.288   \n",
       "\n",
       "   Age (years)  Class variable (0 or 1)  \n",
       "0           50                        1  \n",
       "1           31                        0  \n",
       "2           32                        1  \n",
       "3           21                        0  \n",
       "4           33                        1  "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "df = pd.read_csv(\"Diabetes Binary Classification.csv\")\n",
    "df.columns = df.columns.str.strip()\n",
    "\n",
    "print(\"Размер:\", df.shape)\n",
    "for c in df.columns:\n",
    "    print(\"-\", c)\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "eac0bf0b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: (614, 8)\n",
      "Test: (154, 8)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "target = \"Class variable (0 or 1)\"\n",
    "\n",
    "numeric_cols = df.select_dtypes(include=[\"number\"]).columns.tolist()\n",
    "feature_cols = [c for c in numeric_cols if c != target]\n",
    "\n",
    "data = df[feature_cols + [target]].dropna().copy()\n",
    "\n",
    "X = data[feature_cols]\n",
    "y = data[target]\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42, stratify=y\n",
    ")\n",
    "\n",
    "print(\"Train:\", X_train.shape)\n",
    "print(\"Test:\", X_test.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "74a40b66",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                              Logit Regression Results                             \n",
      "===================================================================================\n",
      "Dep. Variable:     Class variable (0 or 1)   No. Observations:                  614\n",
      "Model:                               Logit   Df Residuals:                      605\n",
      "Method:                                MLE   Df Model:                            8\n",
      "Date:                     Sat, 14 Mar 2026   Pseudo R-squ.:                  0.2789\n",
      "Time:                             13:18:53   Log-Likelihood:                -286.25\n",
      "converged:                            True   LL-Null:                       -396.97\n",
      "Covariance Type:                 nonrobust   LLR p-value:                 1.909e-43\n",
      "============================================================================================================================================\n",
      "                                                                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------------------------------------------------------\n",
      "const                                                                       -0.8807      0.109     -8.064      0.000      -1.095      -0.667\n",
      "Number of times pregnant                                                     0.3805      0.122      3.129      0.002       0.142       0.619\n",
      "Plasma glucose concentration a 2 hours in an oral glucose tolerance test     1.1661      0.135      8.609      0.000       0.901       1.432\n",
      "Diastolic blood pressure (mm Hg)                                            -0.2048      0.110     -1.855      0.064      -0.421       0.012\n",
      "Triceps skin fold thickness (mm)                                             0.0695      0.126      0.551      0.581      -0.178       0.317\n",
      "2-Hour serum insulin (mu U/ml)                                              -0.1371      0.124     -1.107      0.268      -0.380       0.106\n",
      "Body mass index (weight in kg/(height in m)^2)                               0.7274      0.135      5.405      0.000       0.464       0.991\n",
      "Diabetes pedigree function                                                   0.2593      0.111      2.335      0.020       0.042       0.477\n",
      "Age (years)                                                                  0.1819      0.121      1.499      0.134      -0.056       0.420\n",
      "============================================================================================================================================\n"
     ]
    }
   ],
   "source": [
    "\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "X_train_sm = sm.add_constant(\n",
    "    pd.DataFrame(X_train_scaled, columns=feature_cols, index=y_train.index),\n",
    "    has_constant=\"add\"\n",
    ")\n",
    "X_test_sm = sm.add_constant(\n",
    "    pd.DataFrame(X_test_scaled, columns=feature_cols, index=y_test.index),\n",
    "    has_constant=\"add\"\n",
    ")\n",
    "\n",
    "model = sm.Logit(y_train, X_train_sm).fit(disp=False)\n",
    "print(model.summary())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a0143bf4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy_test: 0.7142857142857143\n",
      "Precision_test: 0.6086956521739131\n",
      "Recall_test: 0.5185185185185185\n",
      "F1_test: 0.56\n",
      "ROC_AUC_test: 0.8240740740740741\n",
      "LLR_pvalue: 1.9087920276004937e-43\n",
      "Pseudo_R2: 0.2789142159398924\n"
     ]
    }
   ],
   "source": [
    "\n",
    "train_proba = model.predict(X_train_sm)\n",
    "test_proba = model.predict(X_test_sm)\n",
    "\n",
    "train_pred = (train_proba >= 0.5).astype(int)\n",
    "test_pred = (test_proba >= 0.5).astype(int)\n",
    "\n",
    "print(\"Accuracy_test:\", accuracy_score(y_test, test_pred))\n",
    "print(\"Precision_test:\", precision_score(y_test, test_pred, zero_division=0))\n",
    "print(\"Recall_test:\", recall_score(y_test, test_pred, zero_division=0))\n",
    "print(\"F1_test:\", f1_score(y_test, test_pred, zero_division=0))\n",
    "print(\"ROC_AUC_test:\", roc_auc_score(y_test, test_proba))\n",
    "print(\"LLR_pvalue:\", model.llr_pvalue)\n",
    "print(\"Pseudo_R2:\", model.prsquared)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3bed319c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Explained variance 2PC: 0.47819876073856576\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PC1</th>\n",
       "      <th>PC2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Number of times pregnant</th>\n",
       "      <td>0.128432</td>\n",
       "      <td>0.593786</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Plasma glucose concentration a 2 hours in an oral glucose tolerance test</th>\n",
       "      <td>0.393083</td>\n",
       "      <td>0.174029</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diastolic blood pressure (mm Hg)</th>\n",
       "      <td>0.360003</td>\n",
       "      <td>0.183892</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Triceps skin fold thickness (mm)</th>\n",
       "      <td>0.439824</td>\n",
       "      <td>-0.331965</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2-Hour serum insulin (mu U/ml)</th>\n",
       "      <td>0.435026</td>\n",
       "      <td>-0.250781</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Body mass index (weight in kg/(height in m)^2)</th>\n",
       "      <td>0.451941</td>\n",
       "      <td>-0.100960</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Diabetes pedigree function</th>\n",
       "      <td>0.270611</td>\n",
       "      <td>-0.122069</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Age (years)</th>\n",
       "      <td>0.198027</td>\n",
       "      <td>0.620589</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                         PC1       PC2\n",
       "Number of times pregnant                            0.128432  0.593786\n",
       "Plasma glucose concentration a 2 hours in an or...  0.393083  0.174029\n",
       "Diastolic blood pressure (mm Hg)                    0.360003  0.183892\n",
       "Triceps skin fold thickness (mm)                    0.439824 -0.331965\n",
       "2-Hour serum insulin (mu U/ml)                      0.435026 -0.250781\n",
       "Body mass index (weight in kg/(height in m)^2)      0.451941 -0.100960\n",
       "Diabetes pedigree function                          0.270611 -0.122069\n",
       "Age (years)                                         0.198027  0.620589"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "\n",
    "X_scaled_all = StandardScaler().fit_transform(X)\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "X_pca = pca.fit_transform(X_scaled_all)\n",
    "\n",
    "print(\"Explained variance 2PC:\", pca.explained_variance_ratio_.sum())\n",
    "\n",
    "loadings = pd.DataFrame(\n",
    "    pca.components_.T,\n",
    "    index=feature_cols,\n",
    "    columns=[\"PC1\", \"PC2\"]\n",
    ")\n",
    "display(loadings)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "594d79b5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                              Logit Regression Results                             \n",
      "===================================================================================\n",
      "Dep. Variable:     Class variable (0 or 1)   No. Observations:                  614\n",
      "Model:                               Logit   Df Residuals:                      611\n",
      "Method:                                MLE   Df Model:                            2\n",
      "Date:                     Sat, 14 Mar 2026   Pseudo R-squ.:                  0.1749\n",
      "Time:                             13:18:53   Log-Likelihood:                -327.53\n",
      "converged:                            True   LL-Null:                       -396.97\n",
      "Covariance Type:                 nonrobust   LLR p-value:                 6.954e-31\n",
      "==============================================================================\n",
      "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const         -0.7901      0.099     -7.979      0.000      -0.984      -0.596\n",
      "PC1            0.7629      0.081      9.418      0.000       0.604       0.922\n",
      "PC2            0.3654      0.071      5.155      0.000       0.226       0.504\n",
      "==============================================================================\n"
     ]
    }
   ],
   "source": [
    "\n",
    "pca_df = pd.DataFrame(X_pca, columns=[\"PC1\", \"PC2\"], index=y.index)\n",
    "\n",
    "Xp_train, Xp_test, yp_train, yp_test = train_test_split(\n",
    "    pca_df, y, test_size=0.2, random_state=42, stratify=y\n",
    ")\n",
    "\n",
    "Xp_train_sm = sm.add_constant(Xp_train, has_constant=\"add\")\n",
    "Xp_test_sm = sm.add_constant(Xp_test, has_constant=\"add\")\n",
    "\n",
    "model_pca = sm.Logit(yp_train, Xp_train_sm).fit(disp=False)\n",
    "print(model_pca.summary())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b2b523ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy_test_pca: 0.7077922077922078\n",
      "Precision_test_pca: 0.6\n",
      "Recall_test_pca: 0.5\n",
      "F1_test_pca: 0.5454545454545454\n",
      "ROC_AUC_test_pca: 0.7088888888888889\n",
      "LLR_pvalue_pca: 6.95388111339125e-31\n",
      "Pseudo_R2_pca: 0.17492619879836246\n"
     ]
    }
   ],
   "source": [
    "\n",
    "proba_pca = model_pca.predict(Xp_test_sm)\n",
    "pred_pca = (proba_pca >= 0.5).astype(int)\n",
    "\n",
    "print(\"Accuracy_test_pca:\", accuracy_score(yp_test, pred_pca))\n",
    "print(\"Precision_test_pca:\", precision_score(yp_test, pred_pca, zero_division=0))\n",
    "print(\"Recall_test_pca:\", recall_score(yp_test, pred_pca, zero_division=0))\n",
    "print(\"F1_test_pca:\", f1_score(yp_test, pred_pca, zero_division=0))\n",
    "print(\"ROC_AUC_test_pca:\", roc_auc_score(yp_test, proba_pca))\n",
    "print(\"LLR_pvalue_pca:\", model_pca.llr_pvalue)\n",
    "print(\"Pseudo_R2_pca:\", model_pca.prsquared)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a97c1b1e",
   "metadata": {},
   "source": [
    "## Итог\n",
    "\n",
    "\n",
    "Зависимая переменная — Class variable (0 or 1).\n",
    "\n",
    "Классификация по исходным признакам\n",
    "\n",
    "Качество модели:\n",
    "\n",
    "Accuracy = 0.714\n",
    "\n",
    "F1 = 0.56\n",
    "\n",
    "ROC-AUC = 0.824\n",
    "\n",
    "Pseudo R² = 0.279\n",
    "\n",
    "LLR p-value < 0.001\n",
    "\n",
    "Модель статистически значима. Наиболее значимые признаки:\n",
    "Plasma glucose, BMI, Number of pregnancies, Diabetes pedigree function.\n",
    "\n",
    "Снижение размерности (PCA)\n",
    "\n",
    "Две компоненты объясняют 47.8% дисперсии.\n",
    "\n",
    "Классификация по PC1 и PC2:\n",
    "\n",
    "Accuracy = 0.708\n",
    "\n",
    "F1 = 0.545\n",
    "\n",
    "ROC-AUC = 0.709\n",
    "\n",
    "Pseudo R² = 0.175\n",
    "\n",
    "Модель также значима, но качество ниже.\n",
    "\n",
    "Вывод\n",
    "\n",
    "Классификатор по исходным признакам лучше, так как имеет более высокий ROC-AUC и Pseudo R².\n",
    "Снижение размерности до двух компонент упрощает модель, но приводит к потере части информации и небольшому снижению точности. "
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
