| | question1 | question2 | is_duplicate | len_q1 | len_q2 | diff_len | len_char_q1 | len_char_q2 | len_word_q1 | len_word_q2 | common_words | fuzz_ratio | fuzz_partial_ratio | fuzz_partial_token_set_ratio | fuzz_partial_token_sort_ratio | fuzz_token_set_ratio | fuzz_token_sort_ratio |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | What is the step by step guide to invest in sh... | What is the step by step guide to invest in sh... | 0 | 66 | 57 | 9 | 20 | 20 | 14 | 12 | 10 | 93 | 98 | 100 | 89 | 100 | 93 |
| 1 | What is the story of Kohinoor (Koh-i-Noor) Dia... | What would happen if the Indian government sto... | 0 | 51 | 88 | -37 | 21 | 29 | 8 | 13 | 4 | 65 | 73 | 100 | 75 | 86 | 63 |
| | question1 | question2 | is_duplicate | len_q1 | len_q2 | diff_len | len_char_q1 | len_char_q2 | len_word_q1 | len_word_q2 | common_words | fuzz_ratio | fuzz_partial_ratio | fuzz_partial_token_set_ratio | fuzz_partial_token_sort_ratio | fuzz_token_set_ratio | fuzz_token_sort_ratio | wmd |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | What is the step by step guide to invest in sh... | What is the step by step guide to invest in sh... | 0 | 66 | 57 | 9 | 20 | 20 | 14 | 12 | 10 | 93 | 98 | 100 | 89 | 100 | 93 | 0.564615 |
| 1 | What is the story of Kohinoor (Koh-i-Noor) Dia... | What would happen if the Indian government sto... | 0 | 51 | 88 | -37 | 21 | 29 | 8 | 13 | 4 | 65 | 73 | 100 | 75 | 86 | 63 | 3.772346 |
| | question1 | question2 | is_duplicate | len_q1 | len_q2 | diff_len | len_char_q1 | len_char_q2 | len_word_q1 | len_word_q2 | common_words | fuzz_ratio | fuzz_partial_ratio | fuzz_partial_token_set_ratio | fuzz_partial_token_sort_ratio | fuzz_token_set_ratio | fuzz_token_sort_ratio | wmd | norm_wmd |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | What is the step by step guide to invest in sh... | What is the step by step guide to invest in sh... | 0 | 66 | 57 | 9 | 20 | 20 | 14 | 12 | 10 | 93 | 98 | 100 | 89 | 100 | 93 | 0.564615 | 0.217555 |
| 1 | What is the story of Kohinoor (Koh-i-Noor) Dia... | What would happen if the Indian government sto... | 0 | 51 | 88 | -37 | 21 | 29 | 8 | 13 | 4 | 65 | 73 | 100 | 75 | 86 | 63 | 3.772346 | 1.368796 |
Failed to display Jupyter Widget of type HBox.
\n", " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", " that the widgets JavaScript is still loading. If this message persists, it\n", " likely means that the widgets JavaScript library is either not installed or\n", " not enabled. See the Jupyter\n", " Widgets Documentation for setup instructions.\n", "
\n", "\n", " If you're reading this message in another frontend (for example, a static\n", " rendering on GitHub or NBViewer),\n", " it may mean that your frontend doesn't currently support widgets.\n", "
\n" ], "text/plain": [ "HBox(children=(IntProgress(value=0, max=404287), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in double_scalars\n", " \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bf1d0b01886541909cee3c550bb1ea3e", "version_major": 2, "version_minor": 0 }, "text/html": [ "Failed to display Jupyter Widget of type HBox.
\n", " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", " that the widgets JavaScript is still loading. If this message persists, it\n", " likely means that the widgets JavaScript library is either not installed or\n", " not enabled. See the Jupyter\n", " Widgets Documentation for setup instructions.\n", "
\n", "\n", " If you're reading this message in another frontend (for example, a static\n", " rendering on GitHub or NBViewer),\n", " it may mean that your frontend doesn't currently support widgets.\n", "
\n" ], "text/plain": [ "HBox(children=(IntProgress(value=0, max=404287), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "question1_vectors = np.zeros((df.shape[0], 300))\n", "\n", "for i, q in enumerate(tqdm_notebook(df.question1.values)):\n", " question1_vectors[i, :] = sent2vec(q)\n", " \n", "question2_vectors = np.zeros((df.shape[0], 300))\n", "for i, q in enumerate(tqdm_notebook(df.question2.values)):\n", " question2_vectors[i, :] = sent2vec(q)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:698: RuntimeWarning: invalid value encountered in double_scalars\n", " dist = 1.0 - uv / np.sqrt(uu * vv)\n", "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:853: RuntimeWarning: invalid value encountered in double_scalars\n", " dist = np.double(unequal_nonzero.sum()) / np.double(nonzero.sum())\n", "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:1138: RuntimeWarning: invalid value encountered in double_scalars\n", " return l1_diff.sum() / l1_sum.sum()\n" ] } ], "source": [ "df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", "df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", "df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", "df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", "df['euclidean_distance'] = [euclidean(x, y) for (x, y) in 
zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", "df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", "df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", "df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]\n", "df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]\n", "df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]\n", "df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 255024\n", "1 149263\n", "Name: is_duplicate, dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['is_duplicate'].value_counts()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "question1 0\n", "question2 0\n", "is_duplicate 0\n", "len_q1 0\n", "len_q2 0\n", "diff_len 0\n", "len_char_q1 0\n", "len_char_q2 0\n", "len_word_q1 0\n", "len_word_q2 0\n", "common_words 0\n", "fuzz_ratio 0\n", "fuzz_partial_ratio 0\n", "fuzz_partial_token_set_ratio 0\n", "fuzz_partial_token_sort_ratio 0\n", "fuzz_token_set_ratio 0\n", "fuzz_token_sort_ratio 0\n", "wmd 0\n", "norm_wmd 0\n", "cosine_distance 1775\n", "cityblock_distance 0\n", "jaccard_distance 522\n", "canberra_distance 0\n", "euclidean_distance 0\n", "minkowski_distance 0\n", "braycurtis_distance 522\n", "skew_q1vec 0\n", "skew_q2vec 0\n", "kur_q1vec 0\n", "kur_q2vec 0\n", "dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "df.drop(['question1', 'question2'], axis=1, 
inplace=True)\n", "df = df[pd.notnull(df['cosine_distance'])]\n", "df = df[pd.notnull(df['jaccard_distance'])]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix \n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "X = df.loc[:, df.columns != 'is_duplicate']\n", "y = df.loc[:, df.columns == 'is_duplicate']\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[60757 15121]\n", " [12054 32822]]\n", "Accuracy 0.7749556950494394\n", " precision recall f1-score support\n", "\n", " 0 0.83 0.80 0.82 75878\n", " 1 0.68 0.73 0.71 44876\n", "\n", " micro avg 0.77 0.77 0.77 120754\n", " macro avg 0.76 0.77 0.76 120754\n", "weighted avg 0.78 0.77 0.78 120754\n", "\n" ] } ], "source": [ "import xgboost as xgb\n", "\n", "model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) \n", "prediction = model.predict(X_test)\n", "cm = confusion_matrix(y_test, prediction) \n", "print(cm) \n", "print('Accuracy', accuracy_score(y_test, prediction))\n", "print(classification_report(y_test, prediction))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 
4, "nbformat_minor": 2 }