X Tutup
{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import sparse_dot_topn.sparse_dot_topn as ct\n", "from scipy.sparse import csr_matrix\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/Seattle_Hotels_Duplicates.csv', encoding=\"latin-1\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameaddress
0Hilton Garden Inn Seattle Downtown1821 Boren Avenue, Seattle Washington 98101 USA
1Sheraton Grand Seattle1400 6th Avenue, Seattle, Washington 98101 USA
2Crowne Plaza Seattle Downtown1113 6th Ave, Seattle, WA 98101
\n", "
" ], "text/plain": [ " name \\\n", "0 Hilton Garden Inn Seattle Downtown \n", "1 Sheraton Grand Seattle \n", "2 Crowne Plaza Seattle Downtown \n", "\n", " address \n", "0 1821 Boren Avenue, Seattle Washington 98101 USA \n", "1 1400 6th Avenue, Seattle, Washington 98101 USA \n", "2 1113 6th Ave, Seattle, WA 98101 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(3)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Roy Street Commons 2\n", "Comfort Inn & Suites 2\n", "Seattle Inn Northgate 2\n", "Hotel Seattle 2\n", "Ace Hotel Seattle 2\n", " ..\n", "Econo Lodge Renton-Bellevue 1\n", "Hotel Theodore 1\n", "La Quinta Inn & Suites Seattle Downtown 1\n", "Thompson Seattle 1\n", "Travelodge by Wyndham Seattle North of Downtown 1\n", "Name: name, Length: 155, dtype: int64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.name.value_counts()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameaddress
82Roy Street Commons621 12th Ave E, Seattle, WA 98102
90Roy Street Commons621 12th Avenue East, Seattle, Washington 98102
\n", "
" ], "text/plain": [ " name address\n", "82 Roy Street Commons 621 12th Ave E, Seattle, WA 98102\n", "90 Roy Street Commons 621 12th Avenue East, Seattle, Washington 98102" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[df['name'] == 'Roy Street Commons']" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [
 "df['name_address'] = df['name'] + ' ' + df['address']\n",
 "name_address = df['name_address']\n",
 "# The vectorizer used to receive \"char\" positionally. That slot is `input`, not\n",
 "# `analyzer`, so word n-grams were used all along, and scikit-learn 1.0+ rejects\n",
 "# the positional form. Naming the analyzer keeps the recorded results reproducible.\n",
 "# NOTE(review): use analyzer=\"char\" if character n-grams were intended, then\n",
 "# re-run the cells below.\n",
 "vectorizer = TfidfVectorizer(analyzer=\"word\", ngram_range=(1, 4), sublinear_tf=True)\n",
 "tf_idf_matrix = vectorizer.fit_transform(name_address)"
] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<168x3218 sparse matrix of type ''\n", "\twith 6002 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf_idf_matrix" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [
 "def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
 "    \"\"\"Multiply two sparse matrices through ``ct.sparse_dot_topn``.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    A, B : scipy sparse matrices\n",
 "        Left and right operands; both are converted to CSR first.\n",
 "    ntop : int\n",
 "        Per-row budget: the output buffers hold ``A.shape[0] * ntop`` entries.\n",
 "    lower_bound : float, default 0\n",
 "        Passed straight through to ``ct.sparse_dot_topn``.\n",
 "        NOTE(review): presumably the minimum score an entry needs in order\n",
 "        to be kept - confirm against the sparse_dot_topn documentation.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    scipy.sparse.csr_matrix\n",
 "        Matrix of shape ``(A.shape[0], B.shape[1])`` assembled from the\n",
 "        buffers handed to ``ct.sparse_dot_topn``.\n",
 "    \"\"\"\n",
 "    A = A.tocsr()\n",
 "    B = B.tocsr()\n",
 "    M, _ = A.shape\n",
 "    _, N = B.shape\n",
 "\n",
 "    idx_dtype = np.int32\n",
 "\n",
 "    # Room for `ntop` entries in every one of the M rows.\n",
 "    nnz_max = M * ntop\n",
 "\n",
 "    # Output buffers: passed to the compiled routine below and then used,\n",
 "    # without reassignment, to build the result.\n",
 "    indptr = np.zeros(M + 1, dtype=idx_dtype)\n",
 "    indices = np.zeros(nnz_max, dtype=idx_dtype)\n",
 "    data = np.zeros(nnz_max, dtype=A.dtype)\n",
 "\n",
 "    ct.sparse_dot_topn(\n",
 "        M, N, np.asarray(A.indptr, dtype=idx_dtype),\n",
 "        np.asarray(A.indices, dtype=idx_dtype),\n",
 "        A.data,\n",
 "        np.asarray(B.indptr, dtype=idx_dtype),\n",
 "        np.asarray(B.indices, dtype=idx_dtype),\n",
 "        B.data,\n",
 "        ntop,\n",
 "        lower_bound,\n",
 "        indptr, indices, data)\n",
 "\n",
 "    return csr_matrix((data, indices, indptr), shape=(M, N))\n",
 "\n",
 "# Score every record against every other one, with a budget of 5 entries per row.\n",
 "matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 5)"
] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<168x168 sparse 
matrix of type ''\n", "\twith 840 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matches" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [
 "def get_matches_df(sparse_matrix, name_vector, top=840):\n",
 "    \"\"\"Tabulate the non-zero entries of a sparse score matrix.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    sparse_matrix : scipy sparse matrix\n",
 "        Scores; entry (i, j) pairs item i with item j.\n",
 "    name_vector : sequence or pandas.Series\n",
 "        Labels, looked up by position using the matrix row/column numbers.\n",
 "    top : int or None, default 840\n",
 "        Maximum number of entries to report. A falsy value reports every\n",
 "        entry; a value above the number of available entries is clamped.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    pandas.DataFrame\n",
 "        Columns ``left_side``, ``right_side`` and ``similarity``, one row per\n",
 "        reported entry, in the order the matrix stores them.\n",
 "    \"\"\"\n",
 "    coo = sparse_matrix.tocoo()\n",
 "\n",
 "    # Apply one mask to rows, columns and values so they stay aligned even\n",
 "    # when the matrix stores explicit zeros.\n",
 "    keep = coo.data != 0\n",
 "    rows, cols, scores = coo.row[keep], coo.col[keep], coo.data[keep]\n",
 "\n",
 "    # Never read past the available entries, whatever `top` asks for.\n",
 "    nr_matches = min(top, rows.size) if top else rows.size\n",
 "\n",
 "    # Positional lookup: matrix row/column numbers are positions, not labels.\n",
 "    names = np.asarray(name_vector, dtype=object)\n",
 "\n",
 "    return pd.DataFrame({'left_side': names[rows[:nr_matches]],\n",
 "                         'right_side': names[cols[:nr_matches]],\n",
 "                         'similarity': scores[:nr_matches].astype(float)})\n",
 "\n",
 "matches_df = get_matches_df(matches, name_address)"
] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
left_sideright_sidesimilarity
826Pike's Place Lux Suites by Barsala 2nd Ave and...Pike's Place Lux Suites by Barsala 2rd Ave and...0.715406
831Pike's Place Lux Suites by Barsala 2rd Ave and...Pike's Place Lux Suites by Barsala 2nd Ave and...0.715406
206Holiday Inn Express & Suites Seattle-City Cent...Holiday Inn Express & Suites Seattle City Cent...0.712321
256Holiday Inn Express & Suites Seattle City Cent...Holiday Inn Express & Suites Seattle-City Cent...0.712321
181Travelodge Seattle by The Space Needle 200 6th...Travelodge Seattle by The Space Needle 200 6th...0.669974
211Travelodge Seattle by The Space Needle 200 6th...Travelodge Seattle by The Space Needle 200 6th...0.669974
791citizenM Seattle South Lake Union hotel 201 We...citizenM Seattle South Lake Union hotel 201 We...0.651961
836citizenM Seattle South Lake Union hotel 201 We...citizenM Seattle South Lake Union hotel 201 We...0.651961
586Quality Inn & Suites Seattle Center 618 John S...Quality Inn & Suites Seattle Center 618 John S...0.627400
551Quality Inn & Suites Seattle Center 618 John S...Quality Inn & Suites Seattle Center 618 John S...0.627400
46Hilton Garden Inn Seattle Downtown 1821 Boren ...Hilton Garden Inn Seattle Downtown 1821 Boren ...0.617412
1Hilton Garden Inn Seattle Downtown 1821 Boren ...Hilton Garden Inn Seattle Downtown 1821 Boren ...0.617412
781Hyatt Regency Lake Washington At SeattleS Sout...Hyatt Regency Lake Washington At SeattleS Sout...0.614371
746Hyatt Regency Lake Washington At SeattleS Sout...Hyatt Regency Lake Washington At SeattleS Sout...0.614371
346Home2 Suites by Hilton Seattle Airport 380 Upl...Home2 Suites by Hilton Seattle Airport 380 Upl...0.582787
341Home2 Suites by Hilton Seattle Airport 380 Upl...Home2 Suites by Hilton Seattle Airport 380 Upl...0.582787
561Renaissance Seattle Hotel 515 Madison Street, ...Renaissance Seattle Hotel 515 Madison St, Seat...0.531174
546Renaissance Seattle Hotel 515 Madison St, Seat...Renaissance Seattle Hotel 515 Madison Street, ...0.531174
391Days Inn by Wyndham Seattle North of Downtown ...Travelodge by Wyndham Seattle North of Downtow...0.528202
396Travelodge by Wyndham Seattle North of Downtow...Days Inn by Wyndham Seattle North of Downtown ...0.528202
401Comfort Inn & Suites 13700 Aurora Ave N, Seatt...Comfort Inn & Suites 13700 Aurora Avenue North...0.527696
441Comfort Inn & Suites 13700 Aurora Avenue North...Comfort Inn & Suites 13700 Aurora Ave N, Seatt...0.527696
451Roy Street Commons 621 12th Avenue East, Seatt...Roy Street Commons 621 12th Ave E, Seattle, WA...0.499497
411Roy Street Commons 621 12th Ave E, Seattle, WA...Roy Street Commons 621 12th Avenue East, Seatt...0.499497
11Crowne Plaza Seattle Downtown 1113 6th Ave, Se...Crowne Plaza Seattle 1113 6th Ave, Seattle, Wa...0.469867
61Crowne Plaza Seattle 1113 6th Ave, Seattle, Wa...Crowne Plaza Seattle Downtown 1113 6th Ave, Se...0.469867
671Watertown Hotel - A Staypineapple Hotel 4242 R...University Inn - A Staypineapple Hotel 4140 Ro...0.432854
706University Inn - A Staypineapple Hotel 4140 Ro...Watertown Hotel - A Staypineapple Hotel 4242 R...0.432854
621Executive Inn By The Space Needle 200 Taylor A...Travelodge Seattle by The Space Needle 200 6th...0.428319
182Travelodge Seattle by The Space Needle 200 6th...Executive Inn By The Space Needle 200 Taylor A...0.428319
\n", "
" ], "text/plain": [ " left_side \\\n", "826 Pike's Place Lux Suites by Barsala 2nd Ave and... \n", "831 Pike's Place Lux Suites by Barsala 2rd Ave and... \n", "206 Holiday Inn Express & Suites Seattle-City Cent... \n", "256 Holiday Inn Express & Suites Seattle City Cent... \n", "181 Travelodge Seattle by The Space Needle 200 6th... \n", "211 Travelodge Seattle by The Space Needle 200 6th... \n", "791 citizenM Seattle South Lake Union hotel 201 We... \n", "836 citizenM Seattle South Lake Union hotel 201 We... \n", "586 Quality Inn & Suites Seattle Center 618 John S... \n", "551 Quality Inn & Suites Seattle Center 618 John S... \n", "46 Hilton Garden Inn Seattle Downtown 1821 Boren ... \n", "1 Hilton Garden Inn Seattle Downtown 1821 Boren ... \n", "781 Hyatt Regency Lake Washington At SeattleS Sout... \n", "746 Hyatt Regency Lake Washington At SeattleS Sout... \n", "346 Home2 Suites by Hilton Seattle Airport 380 Upl... \n", "341 Home2 Suites by Hilton Seattle Airport 380 Upl... \n", "561 Renaissance Seattle Hotel 515 Madison Street, ... \n", "546 Renaissance Seattle Hotel 515 Madison St, Seat... \n", "391 Days Inn by Wyndham Seattle North of Downtown ... \n", "396 Travelodge by Wyndham Seattle North of Downtow... \n", "401 Comfort Inn & Suites 13700 Aurora Ave N, Seatt... \n", "441 Comfort Inn & Suites 13700 Aurora Avenue North... \n", "451 Roy Street Commons 621 12th Avenue East, Seatt... \n", "411 Roy Street Commons 621 12th Ave E, Seattle, WA... \n", "11 Crowne Plaza Seattle Downtown 1113 6th Ave, Se... \n", "61 Crowne Plaza Seattle 1113 6th Ave, Seattle, Wa... \n", "671 Watertown Hotel - A Staypineapple Hotel 4242 R... \n", "706 University Inn - A Staypineapple Hotel 4140 Ro... \n", "621 Executive Inn By The Space Needle 200 Taylor A... \n", "182 Travelodge Seattle by The Space Needle 200 6th... \n", "\n", " right_side similarity \n", "826 Pike's Place Lux Suites by Barsala 2rd Ave and... 0.715406 \n", "831 Pike's Place Lux Suites by Barsala 2nd Ave and... 
0.715406 \n", "206 Holiday Inn Express & Suites Seattle City Cent... 0.712321 \n", "256 Holiday Inn Express & Suites Seattle-City Cent... 0.712321 \n", "181 Travelodge Seattle by The Space Needle 200 6th... 0.669974 \n", "211 Travelodge Seattle by The Space Needle 200 6th... 0.669974 \n", "791 citizenM Seattle South Lake Union hotel 201 We... 0.651961 \n", "836 citizenM Seattle South Lake Union hotel 201 We... 0.651961 \n", "586 Quality Inn & Suites Seattle Center 618 John S... 0.627400 \n", "551 Quality Inn & Suites Seattle Center 618 John S... 0.627400 \n", "46 Hilton Garden Inn Seattle Downtown 1821 Boren ... 0.617412 \n", "1 Hilton Garden Inn Seattle Downtown 1821 Boren ... 0.617412 \n", "781 Hyatt Regency Lake Washington At SeattleS Sout... 0.614371 \n", "746 Hyatt Regency Lake Washington At SeattleS Sout... 0.614371 \n", "346 Home2 Suites by Hilton Seattle Airport 380 Upl... 0.582787 \n", "341 Home2 Suites by Hilton Seattle Airport 380 Upl... 0.582787 \n", "561 Renaissance Seattle Hotel 515 Madison St, Seat... 0.531174 \n", "546 Renaissance Seattle Hotel 515 Madison Street, ... 0.531174 \n", "391 Travelodge by Wyndham Seattle North of Downtow... 0.528202 \n", "396 Days Inn by Wyndham Seattle North of Downtown ... 0.528202 \n", "401 Comfort Inn & Suites 13700 Aurora Avenue North... 0.527696 \n", "441 Comfort Inn & Suites 13700 Aurora Ave N, Seatt... 0.527696 \n", "451 Roy Street Commons 621 12th Ave E, Seattle, WA... 0.499497 \n", "411 Roy Street Commons 621 12th Avenue East, Seatt... 0.499497 \n", "11 Crowne Plaza Seattle 1113 6th Ave, Seattle, Wa... 0.469867 \n", "61 Crowne Plaza Seattle Downtown 1113 6th Ave, Se... 0.469867 \n", "671 University Inn - A Staypineapple Hotel 4140 Ro... 0.432854 \n", "706 Watertown Hotel - A Staypineapple Hotel 4242 R... 0.432854 \n", "621 Travelodge Seattle by The Space Needle 200 6th... 0.428319 \n", "182 Executive Inn By The Space Needle 200 Taylor A... 
0.428319 " ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matches_df[matches_df['similarity'] < 0.99999].sort_values(by=['similarity'], ascending=False).head(30)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "152" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matches_df[matches_df['similarity'] < 0.50].right_side.nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }
X Tutup