{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "97e13347-621c-4656-a175-8ebcf9d842f0", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import math\n", "pd.options.display.max_rows = 4000" ] }, { "cell_type": "code", "execution_count": 2, "id": "aac70b32-b23f-4394-ba64-bc678a9429e7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
area_typeavailabilitylocationsizesocietytotal_sqftbathbalconyprice
0Super built-up Area19-DecElectronic City Phase II2 BHKCoomee10562.01.039.07
1Plot AreaReady To MoveChikka Tirupathi4 BedroomTheanmp26005.03.0120.00
2Built-up AreaReady To MoveUttarahalli3 BHKNaN14402.03.062.00
3Super built-up AreaReady To MoveLingadheeranahalli3 BHKSoiewre15213.01.095.00
4Super built-up AreaReady To MoveKothanur2 BHKNaN12002.01.051.00
\n", "
" ], "text/plain": [ " area_type availability location size \\\n", "0 Super built-up Area 19-Dec Electronic City Phase II 2 BHK \n", "1 Plot Area Ready To Move Chikka Tirupathi 4 Bedroom \n", "2 Built-up Area Ready To Move Uttarahalli 3 BHK \n", "3 Super built-up Area Ready To Move Lingadheeranahalli 3 BHK \n", "4 Super built-up Area Ready To Move Kothanur 2 BHK \n", "\n", " society total_sqft bath balcony price \n", "0 Coomee 1056 2.0 1.0 39.07 \n", "1 Theanmp 2600 5.0 3.0 120.00 \n", "2 NaN 1440 2.0 3.0 62.00 \n", "3 Soiewre 1521 3.0 1.0 95.00 \n", "4 NaN 1200 2.0 1.0 51.00 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"datasets/Bengaluru_House_Data.csv\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "bdd90a24-68a5-4d9d-87aa-05331c3e0c05", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
availabilitylocationsizesocietytotal_sqftbathbalconyprice
area_type
Built-up Area24182418241812152418241023102418
Carpet Area8787875487878287
Plot Area2025202520093112025200918372025
Super built-up Area87908789879062388790874184828790
\n", "
" ], "text/plain": [ " availability location size society total_sqft bath \\\n", "area_type \n", "Built-up Area 2418 2418 2418 1215 2418 2410 \n", "Carpet Area 87 87 87 54 87 87 \n", "Plot Area 2025 2025 2009 311 2025 2009 \n", "Super built-up Area 8790 8789 8790 6238 8790 8741 \n", "\n", " balcony price \n", "area_type \n", "Built-up Area 2310 2418 \n", "Carpet Area 82 87 \n", "Plot Area 1837 2025 \n", "Super built-up Area 8482 8790 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# non-null entries per column within each area_type\n", "df.groupby(\"area_type\").count()" ] }, { "cell_type": "code", "execution_count": 4, "id": "08bba0ab-e89d-43c1-9594-d3dd660be63d", "metadata": {}, "outputs": [], "source": [ "# discard the four columns that are not used further on\n", "df.drop(columns=['area_type','availability','society','balcony'],inplace=True)" ] }, { "cell_type": "code", "execution_count": 5, "id": "ca2ade22-d6ee-402a-9e4f-1aef26b2f89e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13320, 5)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 6, "id": "aa521583-3810-439e-aa8f-7693ba9fdbab", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "location 1\n", "size 16\n", "total_sqft 0\n", "bath 73\n", "price 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# missing values per column\n", "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 7, "id": "7a17c0de-5ab0-44c6-8e5f-a8b05797b383", "metadata": {}, "outputs": [], "source": [ "df.dropna(inplace=True)" ] }, { "cell_type": "code", "execution_count": 8, "id": "bc44288e-49f6-4972-9542-7b049368caa7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "location False\n", "size False\n", "total_sqft False\n", "bath False\n", "price False\n", "dtype: bool" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# every column should now report False\n", "df.isna().any()" ] }, { "cell_type": "code", "execution_count": 9, 
"id": "96bc00f7-97be-495e-9833-fdef0cc9edda", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathprice
0Electronic City Phase II2 BHK10562.039.07
1Chikka Tirupathi4 Bedroom26005.0120.00
2Uttarahalli3 BHK14402.062.00
3Lingadheeranahalli3 BHK15213.095.00
4Kothanur2 BHK12002.051.00
\n", "
" ], "text/plain": [ " location size total_sqft bath price\n", "0 Electronic City Phase II 2 BHK 1056 2.0 39.07\n", "1 Chikka Tirupathi 4 Bedroom 2600 5.0 120.00\n", "2 Uttarahalli 3 BHK 1440 2.0 62.00\n", "3 Lingadheeranahalli 3 BHK 1521 3.0 95.00\n", "4 Kothanur 2 BHK 1200 2.0 51.00" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 10, "id": "e69ba835-f0dc-4531-b45a-91fba2a62f26", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',\n", " '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',\n", " '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',\n", " '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',\n", " '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',\n", " '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['size'].unique()" ] }, { "cell_type": "code", "execution_count": 11, "id": "8c31f9f0-27ac-4aca-b167-792c048ab7d2", "metadata": {}, "outputs": [], "source": [ "def extractNum(s):\n", "    \"\"\"Return the integer spelled by the leading ASCII digits of `s`.\n", "\n", "    Scanning stops at the first non-digit character or at the end of the\n", "    string, so a digits-only value such as '2' does not raise IndexError.\n", "    Returns 0 when `s` is empty or does not start with a digit.\n", "    \"\"\"\n", "    num = 0\n", "    for ch in s:\n", "        if not ('0' <= ch <= '9'):\n", "            break\n", "        num = num*10 + int(ch)\n", "    return num" ] }, { "cell_type": "code", "execution_count": 12, "id": "1e83fd89-ed93-4aa2-9d7c-2ebd90f132be", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2\n" ] } ], "source": [ "print(extractNum(\"2 bhk\"))" ] }, { "cell_type": "code", "execution_count": 13, "id": "9e70e370-1883-4c3e-8651-92bdc7e5c603", "metadata": {}, "outputs": [], "source": [ "df['size']=df['size'].apply(extractNum)" ] }, { "cell_type": "code", "execution_count": 14, "id": "477c0c01-c8c2-4833-a317-ff9fa2bdabc5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 2, 4, 3, 6, 1, 8, 7, 5, 11, 9, 27, 10, 19, 16, 43, 14, 12,\n", " 13, 18], 
dtype=int64)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['size'].unique()" ] }, { "cell_type": "code", "execution_count": 15, "id": "3105a155-468d-4c85-a6a8-6dd3b27fb987", "metadata": {}, "outputs": [], "source": [ "df.to_csv('ygug.csv')" ] }, { "cell_type": "code", "execution_count": 16, "id": "402344c6-aaed-4de4-9b13-ad5e77bc21bb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],\n", " dtype=object)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.total_sqft.unique()" ] }, { "cell_type": "code", "execution_count": 17, "id": "ec23ac37-68ec-4ade-a189-c800fa06835c", "metadata": {}, "outputs": [], "source": [ "def rangeToMean(x):\n", "    \"\"\"Convert one total_sqft entry to a float.\n", "\n", "    A value that float() accepts is returned as a float. A dash-separated\n", "    range such as '1133 - 1384' is replaced by the midpoint of its first two\n", "    parts. Anything else (text that is neither a number nor a range, or a\n", "    non-string that float() rejects) gives None, so the row can be removed\n", "    with dropna() afterwards.\n", "    \"\"\"\n", "    try:\n", "        return float(x)\n", "    except (TypeError, ValueError):\n", "        pass  # not a plain number; try to read it as a range below\n", "    try:\n", "        low, high = x.split('-')[:2]\n", "        return (float(low) + float(high)) / 2\n", "    except (AttributeError, TypeError, ValueError):\n", "        # not a string, fewer than two parts, or a non-numeric bound\n", "        return None" ] }, { "cell_type": "code", "execution_count": 18, "id": "a1de141c-81b3-4a1b-b664-025d2fdffc82", "metadata": {}, "outputs": [], "source": [ "df['total_sqft'] = df['total_sqft'].apply(rangeToMean)" ] }, { "cell_type": "code", "execution_count": 19, "id": "3be0c853-0947-4b15-be72-187a708fc54c", "metadata": {}, "outputs": [], "source": [ "df.dropna(inplace=True)" ] }, { "cell_type": "code", "execution_count": 20, "id": "7d44c637-c9cd-4139-a7d2-57d227da687c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "location False\n", "size False\n", "total_sqft False\n", "bath False\n", "price False\n", "dtype: bool" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isna().any()" ] }, { "cell_type": "code", "execution_count": 21, "id": "b03ac292-fea6-438c-82b0-7b9a29a0ea80", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathprice
0Electronic City Phase II21056.02.039.07
1Chikka Tirupathi42600.05.0120.00
2Uttarahalli31440.02.062.00
3Lingadheeranahalli31521.03.095.00
4Kothanur21200.02.051.00
\n", "
" ], "text/plain": [ " location size total_sqft bath price\n", "0 Electronic City Phase II 2 1056.0 2.0 39.07\n", "1 Chikka Tirupathi 4 2600.0 5.0 120.00\n", "2 Uttarahalli 3 1440.0 2.0 62.00\n", "3 Lingadheeranahalli 3 1521.0 3.0 95.00\n", "4 Kothanur 2 1200.0 2.0 51.00" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 22, "id": "315a28ae-b560-4252-b3dc-3d926c0bcec2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpriceprice_per_sqft
0Electronic City Phase II21056.02.039.073699.810606
1Chikka Tirupathi42600.05.0120.004615.384615
2Uttarahalli31440.02.062.004305.555556
3Lingadheeranahalli31521.03.095.006245.890861
4Kothanur21200.02.051.004250.000000
\n", "
" ], "text/plain": [ " location size total_sqft bath price price_per_sqft\n", "0 Electronic City Phase II 2 1056.0 2.0 39.07 3699.810606\n", "1 Chikka Tirupathi 4 2600.0 5.0 120.00 4615.384615\n", "2 Uttarahalli 3 1440.0 2.0 62.00 4305.555556\n", "3 Lingadheeranahalli 3 1521.0 3.0 95.00 6245.890861\n", "4 Kothanur 2 1200.0 2.0 51.00 4250.000000" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['price_per_sqft']=df['price']*100000/df['total_sqft']\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "6f5e4a78-29b9-4173-8c93-64a299b2bfff", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "2a629008-8a90-4270-8a64-eecf0c871221", "metadata": {}, "source": [ "## Outlier Detection and Removal" ] }, { "cell_type": "code", "execution_count": 23, "id": "ac9d1ebb-749c-4729-9dd5-f8cd12c074f5", "metadata": {}, "outputs": [], "source": [ "df = df[~(df['total_sqft']/df['size']<300)]" ] }, { "cell_type": "code", "execution_count": 24, "id": "9a089a6a-0e86-4b15-9239-f4954326b8e0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 12456.000000\n", "mean 6308.502826\n", "std 4168.127339\n", "min 267.829813\n", "25% 4210.526316\n", "50% 5294.117647\n", "75% 6916.666667\n", "max 176470.588235\n", "Name: price_per_sqft, dtype: float64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['price_per_sqft'].describe()" ] }, { "cell_type": "code", "execution_count": 25, "id": "46907e8a-87ca-4cbb-bbc4-e37dae7a4a91", "metadata": {}, "outputs": [], "source": [ "# removing outliers for location-wise price_per_sqft:\n", "# within each location keep the rows whose price_per_sqft lies within one\n", "# standard deviation of that location's mean; locations with a single row\n", "# are dropped entirely\n", "df.location=df.location.apply(lambda x: x.strip())\n", "l= df.groupby('location')\n", "kept_groups = []\n", "for key, group in l:\n", "    if(len(group)>1):\n", "        # statistics of this location only -- taking them from the whole\n", "        # frame would apply one global price band to every location\n", "        m = np.mean(group['price_per_sqft'])\n", "        std = np.std(group['price_per_sqft'])\n", "        red_df = group[((group['price_per_sqft']>=(m-std)) & (group['price_per_sqft']<=(m+std)))]\n", "        kept_groups.append(red_df)\n", "# concatenate once instead of growing a DataFrame inside the loop\n", "new_df = pd.concat(kept_groups) if kept_groups else df.iloc[0:0]\n", "df = new_df" ] }, { "cell_type": "code", "execution_count": 26, "id": "ad168cf9-2709-4331-8322-c7cb2cc42cc3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10959, 6)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 27, "id": "2493a43a-c464-41ac-b183-21a21e635acc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 3, 1, 4, 2, 5, 6, 7, 8, 9, 16, 10], dtype=int64)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['size'].unique()" ] }, { "cell_type": "code", "execution_count": 28, "id": "0bb99211-d1da-4556-8b5f-9703b58a3d03", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "# price against area: 2-bedroom homes as red '+', 3-bedroom homes as green '.'\n", "# The figure is created before plotting so that figsize applies to this\n", "# plot; calling plt.figure() after the scatter calls opens a second, empty\n", "# figure and leaves the plot at the default size.\n", "plt.figure(figsize=(15,5))\n", "plt.scatter(df[df['size']==2]['total_sqft'],df[df['size']==2]['price'],color='red',marker='+')\n", "plt.scatter(df[df['size']==3]['total_sqft'],df[df['size']==3]['price'],color='green',marker='.')\n", "plt.xlabel(\"total_sqft\")\n", "plt.ylabel(\"Price\")\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "cca9fddd-d8fe-4de2-baf3-8cf3d0b1e7fd", "metadata": {}, "source": [ "## Visualisation with datashader" ] }, { "cell_type": "code", "execution_count": 29, "id": "824fabbb-6e9b-45cc-98b0-e17d7df3993e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Abhay\\anaconda3\\lib\\site-packages\\dask\\dataframe\\utils.py:369: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", " _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", "C:\\Users\\Abhay\\anaconda3\\lib\\site-packages\\dask\\dataframe\\utils.py:369: FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", " _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", "C:\\Users\\Abhay\\anaconda3\\lib\\site-packages\\dask\\dataframe\\utils.py:369: FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", " _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAOcAAADnCAYAAADl9EEgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAADeUlEQVR4nO3cMc5MYRSA4bnCBsQK7OBP9CqFHYhVWMKd3gpswRJYgkStVgmVjr+4GqIZV3Xc1+R5ypnkfNO8OckUZ9m27QT03Dn6BwCXiROixAlR4oQocULU3b0vz+ezv3Lhp3VdR+Zu27Zc+tzmhChxQpQ4IUqcECVOiBInRIkTosQJUeKEKHFClDghSpwQJU6IEidEiROixAlR4oQocUKUOCFKnBAlTojavb4H/DZ1fe9PbE6IEidEiROixAlR4oQocUKUOCFKnBAlTogSJ0SJE6LECVHihChxQpQ4IUqcECVOiBInRIkTosQJUeKEqKu6vjdxHe1fX1yDX2xOiBInRIkTosQJUeKEKHFClDghSpwQJU6IEidEiROixAlR4oQocUKUOCFKnBAlTogSJ0SJE6LECVGHHPh6M3Q0yzEuronNCVHihChxQpQ4IUqcECVOiBInRIkTosQJUeKEKHFClDghSpwQJU6IEidEiROixAlR4oQocUKUOCFKnBB1yPW9h8vM3PcD1/du/rNLgS4QXg+bE6LECVHihChxQpQ4IUqcECVOiBInRIkTosQJUeKEKHFClDghSpwQJU6IEidEiROixAlR4oQocUKUOCFq9/re1/M68uizoZt/r25n5k5wJY+/sTkhSpwQJU6IEidEiROixAlR4oQocUKUOCFKnBAlTogSJ0SJE6LECVHihChxQpQ4IUqcECVOiBInRIkTonbv4L18PvPo29czc7/MjIVD2JwQJU6IEidEiROixAlR4oQocUKUOCFKnBAlTogSJ0SJE6LECVHihChxQpQ4IUqcECVOiBInRIkTosQJUbvX95YX90YeffzhdmTug3cjY+EQNidEiROixAlR4oQocUKUOCFKnBAlTogSJ0SJE6LECVHihChxQpQ4IUqcECVOiBInRIkTosQJUeKEqN0DX6dH30ce/fZpGZl7s64jc+EINidEiROixAlR4oQocUKUOCFKnBAlTogSJ0SJE6LECVHihChxQpQ4IUqcECVOiBInRIkTosQJUeKEKHFC1P71vc/3Rx59+nFk7OnJzFg4hM0JUeKEKHFClDghSpwQJU6IEidEiROixAlR4oQocUKUOCFKnBAlTogSJ0SJE6LECVHihChxQpQ4IUqcELVs23b0bwAusDkhSpwQJU6IEidEiROixAlRPwAfPC9vgQglngAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import datashader as ds\n", "import colorcet as cc\n", "bhk_2 = df[df['size']==2]\n", "x_r = [bhk_2['total_sqft'].min(),bhk_2['total_sqft'].max()]\n", "y_r = [bhk_2['price'].min(),bhk_2['price'].max()]\n", "cvs = ds.Canvas(plot_width=10, plot_height=10,x_range=x_r,y_range=y_r) # auto range or provide the `bounds` argument\n", "agg = cvs.points(bhk_2, 'total_sqft', 'price') # this is the histogram\n", "img = ds.tf.set_background(ds.tf.shade(agg, how=\"log\", cmap=cc.fire), \"grey\").to_pil() # create a rasterized image\n", "plt.imshow(img)\n", "plt.axis('off')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 30, "id": "3258accc-e592-4db1-9354-bdb81c1b0081", "metadata": {}, "outputs": [], "source": [ "def bhk_outlier(d,location):\n", "    \"\"\"Scatter price against total_sqft for one location's 2- and 3-bedroom rows.\n", "\n", "    d: frame with 'location', 'size', 'total_sqft' and 'price' columns.\n", "    location: value of the 'location' column to plot; also used as the title.\n", "    Draws on the current pyplot figure and shows it; returns nothing.\n", "    \"\"\"\n", "    in_location = d['location']==location\n", "    # (bedroom count, marker, legend label, colour) for each plotted series\n", "    series = (\n", "        (2, '+', '2 bhk', 'green'),\n", "        (3, '.', '3_bhk', 'blue'),\n", "    )\n", "    plt.title(location)\n", "    plt.xlabel(\"total Sqft Area\")\n", "    plt.ylabel(\"Price\")\n", "    for rooms, marker, label, colour in series:\n", "        subset = d[in_location & (d['size']==rooms)]\n", "        plt.scatter(subset['total_sqft'],subset['price'],marker=marker,label=label,color=colour)\n", "    plt.legend()\n", "    plt.show()" ] }, { "cell_type": "code", "execution_count": 31, "id": "dea56f46-e3b4-448b-bbc4-7f9cfaf4a10f", "metadata": {}, "outputs": [], "source": [ "# unique_locations = df.location.unique()\n", "# for location in unique_locations:\n", "# bhk_outlier(df,location)\n", "# print('\\n\\n\\n')" ] }, { "cell_type": "code", "execution_count": 32, "id": "f96c2b1e-7efe-4d5a-8692-289d76e7aeab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpriceprice_per_sqft
56101st Block BEL Layout31540.03.085.05519.480519
84701st Block HBR Layout1600.01.045.07500.000000
125641st Block HBR Layout43150.04.0150.04761.904762
23081st Block HRBR Layout32300.03.080.03478.260870
77151st Block HRBR Layout21250.02.067.05360.000000
\n", "
" ], "text/plain": [ " location size total_sqft bath price price_per_sqft\n", "5610 1st Block BEL Layout 3 1540.0 3.0 85.0 5519.480519\n", "8470 1st Block HBR Layout 1 600.0 1.0 45.0 7500.000000\n", "12564 1st Block HBR Layout 4 3150.0 4.0 150.0 4761.904762\n", "2308 1st Block HRBR Layout 3 2300.0 3.0 80.0 3478.260870\n", "7715 1st Block HRBR Layout 2 1250.0 2.0 67.0 5360.000000" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 33, "id": "394d151c-c806-4d0a-be81-f91f8e5f5ead", "metadata": {}, "outputs": [], "source": [ "def bhk_outlier_remover(data=None, min_count=5):\n", "    \"\"\"Drop homes priced per sqft below smaller homes of the same location.\n", "\n", "    Within each location, a row with n bedrooms is removed when its\n", "    price_per_sqft is lower than the mean price_per_sqft of the rows with\n", "    n-1 bedrooms, provided more than `min_count` such rows exist.\n", "\n", "    data: frame with 'location', 'size' and 'price_per_sqft' columns;\n", "        defaults to the notebook-level `df`, which is what a call with no\n", "        arguments operates on.\n", "    min_count: the (n-1)-bedroom group must contain more rows than this\n", "        for its mean to be used as the reference.\n", "    Returns a new frame; `data` itself is not modified.\n", "    \"\"\"\n", "    if data is None:\n", "        data = df\n", "    exclude_indices = []\n", "    for loc, loc_df in data.groupby('location'):\n", "        bhks = loc_df.groupby('size')\n", "        # mean price_per_sqft and row count for each bedroom count\n", "        stats = {\n", "            bhk: {\n", "                'mean': np.mean(bhk_group['price_per_sqft']),\n", "                'count': bhk_group.shape[0]\n", "            }\n", "            for bhk, bhk_group in bhks\n", "        }\n", "        for bhk, bhk_group in bhks:\n", "            stats_prev = stats.get(bhk-1)\n", "            if stats_prev and stats_prev['count'] > min_count:\n", "                below_prev_mean = bhk_group['price_per_sqft'] < stats_prev['mean']\n", "                # collect the labels unchanged; a float array accumulator\n", "                # would coerce integer index labels to floats\n", "                exclude_indices.extend(bhk_group[below_prev_mean].index)\n", "    return data.drop(exclude_indices)" ] }, { "cell_type": "code", "execution_count": 34, "id": "c2bf8e0f-6e97-4a4e-8a71-1334a0f88efd", "metadata": {}, "outputs": [], "source": [ "df2 = bhk_outlier_remover()" ] }, { "cell_type": "code", "execution_count": 35, "id": "63c7ebf2-a16b-4be3-bd58-b87890b5354b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8764, 6)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.shape" ] }, { "cell_type": "code", 
"execution_count": 36, "id": "9d0156ce-912a-406f-83f7-7288e9ab83a4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10959, 6)" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 37, "id": "95ebb8af-8b2f-45c8-a82b-887a57ec6a08", "metadata": { "tags": [] }, "outputs": [], "source": [ "# df.location =df.location.apply(lambda x: x.strip())\n", "# location_stats = df.groupby('location')['location'].agg('count')\n", "# location_stats" ] }, { "cell_type": "code", "execution_count": 38, "id": "18a3b81b-487c-4fc9-b010-aae5a63f0e4c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "752" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# number of distinct locations left in df2\n", "len(df2['location'].unique())" ] }, { "cell_type": "markdown", "id": "61531f21-5d53-48d5-a2ed-b73c127d77f4", "metadata": {}, "source": [ "### model training" ] }, { "cell_type": "code", "execution_count": 39, "id": "eb533a5c-70dc-4735-821d-4f35fc65a0bd", "metadata": {}, "outputs": [], "source": [ "# price_per_sqft is computed from price, so it is removed from df2 here\n", "df2.drop(columns='price_per_sqft',inplace=True)" ] }, { "cell_type": "code", "execution_count": 40, "id": "b272fca5-4feb-4bd0-9caa-dec1cc875626", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathprice
56101st Block BEL Layout31540.03.085.0
84701st Block HBR Layout1600.01.045.0
125641st Block HBR Layout43150.04.0150.0
23081st Block HRBR Layout32300.03.080.0
77151st Block HRBR Layout21250.02.067.0
\n", "
" ], "text/plain": [ " location size total_sqft bath price\n", "5610 1st Block BEL Layout 3 1540.0 3.0 85.0\n", "8470 1st Block HBR Layout 1 600.0 1.0 45.0\n", "12564 1st Block HBR Layout 4 3150.0 4.0 150.0\n", "2308 1st Block HRBR Layout 3 2300.0 3.0 80.0\n", "7715 1st Block HRBR Layout 2 1250.0 2.0 67.0" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.head()" ] }, { "cell_type": "code", "execution_count": 41, "id": "2a2ec5e7-d63a-463c-a447-5f0b6ef818b7", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import OneHotEncoder" ] }, { "cell_type": "code", "execution_count": 42, "id": "03a87d19-bd9a-46bf-a1fb-80d58250e54e", "metadata": {}, "outputs": [], "source": [ "ohe = OneHotEncoder()" ] }, { "cell_type": "code", "execution_count": 43, "id": "c391eb23-71b2-4dd0-9bbd-7d067ddbe92e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OneHotEncoder()" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ohe.fit(df2[['location']])" ] }, { "cell_type": "code", "execution_count": 44, "id": "1f2d6a85-6d02-43be-a513-f1b9fa5b6258", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8764, 752)" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "location_encoding = ohe.transform(df2[['location']]).toarray()\n", "location_encoding.shape" ] }, { "cell_type": "code", "execution_count": 45, "id": "04a57321-7031-465e-a507-8610abdb3b82", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[array(['1st Block BEL Layout', '1st Block HBR Layout',\n", " '1st Block HRBR Layout', '1st Block Jayanagar',\n", " '1st Block Koramangala', '1st Phase JP Nagar',\n", " '1st Stage Indira Nagar', '2nd Block Bel Layout',\n", " '2nd Block Hrbr Layout', '2nd Block Jayanagar',\n", " '2nd Phase JP Nagar', '2nd Phase Judicial Layout',\n", " '2nd Stage Arekere Mico Layout', '2nd Stage Nagarbhavi',\n", " '3rd Block Banashankari', 
'3rd Block Hrbr Layout',\n", " '3rd Block Jayanagar', '3rd Block Koramangala',\n", " '3rd Phase JP Nagar', '4th Block Jayanagar',\n", " '4th Block Koramangala', '4th Phase JP Nagar',\n", " '4th T block Jayanagar', '5th Block Hbr Layout',\n", " '5th Phase JP Nagar', '5th Stage BEML Layout',\n", " '6th Phase JP Nagar', '6th block Koramangala',\n", " '7th Block Jayanagar', '7th Phase JP Nagar', '8th Block Jayanagar',\n", " '8th Phase JP Nagar', '8th block Koramangala',\n", " '9th Phase JP Nagar', 'A Narayanapura', 'AECS Layout',\n", " 'AGS Layout', 'AMS Layout', 'Abbaiah Reddy Layout', 'Abbigere',\n", " 'Adityanagar', 'Agrahara Dasarahalli', 'Aishwarya Crystal Layout',\n", " 'Akshaya Nagar', 'Akshaya Vana', 'Akshayanagara East',\n", " 'Akshayanagara West', 'Akshya Nagar', 'Alfa Garden Layout', 'Alur',\n", " 'Amam Enclave Layout', 'Amarjyothi Colony', 'Ambalipura',\n", " 'Ambedkar Colony', 'Ambedkar Nagar', 'Amblipura', 'Amruthahalli',\n", " 'Amruthnagar', 'Anand Nagar', 'Anand nagar', 'Anandapura',\n", " 'Anantapura', 'Ananth Nagar', 'Anekal', 'Anjanapura',\n", " 'Anjappa Layout', 'Ankappa Layout', 'Annaiah Reddy Layout',\n", " 'Annapurneshwari Nagar', 'Anugrah Layout', 'Anwar Layout',\n", " 'Ardendale', 'Arehalli', 'Arekere', 'Ashirvad Colony',\n", " 'Ashok Nagar', 'Ashwath Nagar', 'Ashwathnagar', 'Ashwini layout',\n", " 'Atmananda Colony', 'Attibele', 'Attur Layout', 'Austin Town',\n", " 'Avalahalli', 'Ayappa Nagar', 'B Channasandra', 'B Narayanapura',\n", " 'BCC Layout', 'BCMC Layout', 'BEL Road', 'BEML Layout',\n", " 'BHEL Layout', 'BSM Extension', 'BTM 1st Stage', 'BTM 2nd Stage',\n", " 'BTM 4th Stage', 'BTM Layout', 'Baba Nagar', 'Babusapalaya',\n", " 'Badavala Nagar', 'Bagalakunte', 'Bagalur', 'Bagalur Main Road',\n", " 'Balagere', 'Balaji Gardens Layout', 'Banagiri Nagar',\n", " 'Banashankari', 'Banashankari Stage II', 'Banashankari Stage III',\n", " 'Banashankari Stage V', 'Banashankari Stage VI', 'Banaswadi',\n", " 'Banjara Layout', 'Bank Of Baroda Colony', 
'Bannerghatta',\n", " 'Bannerghatta Road', 'Basapura', 'Basava Nagar', 'Basavanagara',\n", " 'Basavanapura', 'Basavangudi', 'Basavanna Nagar',\n", " 'Basaveshwara Nagar', 'Basaveshwara Nagar Yelahanka',\n", " 'Battarahalli', 'Begur', 'Begur Road', 'Belathur', 'Belatur',\n", " 'Bellandur', 'Bellari Road', 'Bendiganahalli', 'Benson Town',\n", " 'Bethel Nagar', 'Bettahalsoor', 'Bhagyalakshmi Avenue',\n", " 'Bharathi Nagar', 'Bhoganhalli', 'Bhoopsandra',\n", " 'Bhuvaneshwari Nagar', 'Bhuvaneswari Nagar', 'Bidadi',\n", " 'Bidrahalli', 'Bikasipura', 'Bileshivale', 'Billekahalli',\n", " 'Binny Pete', 'Bisuvanahalli', 'Bommanahalli', 'Bommasandra',\n", " 'Bommasandra Industrial Area', 'Bommenahalli', 'Brindavan Layout',\n", " 'Brindavan Nagar', 'Brooke Bond First Cross', 'Brookefield',\n", " 'Budigere', 'Byadarahalli', 'Byagadadhenahalli', 'Byatarayanapura',\n", " 'Byrasandra', 'Byrathi Village', 'CQAL Layout', 'CV Raman Nagar',\n", " 'Cambridge Layout', 'Canara Bank Colony', 'Canara Bank Layout',\n", " 'Carmelaram', 'Celebrity Paradise Layout', 'Challaghatta',\n", " 'Chamrajpet', 'Chamundi Nagar', 'Chandapura', 'Chandra Layout',\n", " 'Channasandra', 'Channasandra Layout', 'Chelekare',\n", " 'Chennammana Kere', 'Chennammanakere Achukattu',\n", " 'Chennappa Layout', 'Chikka Banaswadi', 'Chikka Tirupathi',\n", " 'Chikkabanavar', 'Chikkadunnasandra', 'Chikkakannalli',\n", " 'Chikkalasandra', 'Chikkasandra', 'Chikkathoguru',\n", " 'Chinnapanahalli', 'Chokkanahalli', 'Cholanayakanahalli',\n", " 'Choodasandra', 'Classic Paradise Layout', 'Cleveland Town',\n", " 'Coconut Grove Layout', 'Coffee Board Layout', 'Cooke Town',\n", " 'Cottonpet', 'Cox Town', 'Crimson Layout',\n", " 'D Group Employees Layout', 'Daadys Gaarden Layout',\n", " 'Dairy Circle', 'Dasanapura', 'Dasarahalli', 'Dena Bank Colony',\n", " 'Devanahalli', 'Devanahalli Int. 
Airport', 'Devarabeesana Halli',\n", " 'Devarachikkanahalli', 'Devasthanagalu', 'Devi Nagar', 'Dinnur',\n", " 'Divya Unnathi Layout', 'Doctors Layout', 'Dodda Banaswadi',\n", " 'Dodda Kempaiah Layout', 'Dodda Nekkundi',\n", " 'Dodda Nekkundi Extension', 'Doddaballapur', 'Doddabanahalli',\n", " 'Doddabidrakallu', 'Doddabommasandra', 'Doddagubbi',\n", " 'Doddakallasandra', 'Doddakammanahalli', 'Doddakannelli',\n", " 'Doddanakundi Industrial Area 2', 'Doddanekundi', 'Doddathoguru',\n", " 'Dodsworth Layout', 'Dollar Scheme Colony', 'Dollars Colony',\n", " 'Dollars Layout', 'Domlur', 'Domlur Layout', 'Dommasandra',\n", " 'Doopanahalli', 'Dooravani Nagar', 'Dr Shivarama Karantha Nagar',\n", " 'Dwarka Nagar', 'ECC Road, Whitefield,', 'EPIP Zone', 'Ejipura',\n", " 'Electronic City', 'Electronic City Phase II',\n", " 'Electronic city Phase 1,', 'Electronics City Phase 1',\n", " 'Esther Enclave Layout', 'Ferrar Nagar', 'Frazer Town',\n", " 'Friends Colony', 'GD Layout', 'GM Palaya', 'Ganesha Block',\n", " 'Ganga Nagar', 'Garebhavipalya', 'Garudachar Palya', 'Gattahalli',\n", " 'Gaurava Nagar', 'Geddalahalli', 'Giri Nagar', 'Gkvk Layout',\n", " 'Glass Factory Layout', 'Gnana Bharathi', 'Gokula Extension',\n", " 'Gollahalli', 'Gollarapalya Hosahalli', 'Gopalapura',\n", " 'Gopalkrishna Nagar', 'Gottigere', 'Govindapura',\n", " 'Govindaraja Nagar Ward', 'Govindpura', 'Gowdanapalya',\n", " 'Green Domain Layout', 'Green Garden Layout', 'Green Glen Layout',\n", " 'Green View Layout', 'Green Woods Layout', 'Gubbalala',\n", " 'Guddadahalli', 'Gulimangala', 'Gunjur', 'Gunjur Palya',\n", " 'HAL 2nd Stage', 'HAL 3rd Stage', 'HBR Layout', 'HMT Layout',\n", " 'HOSUR MAIN ROAD', 'HRBR Layout', 'HSR Layout', 'Hadosiddapura',\n", " 'Hagadur', 'Hallehalli', 'Hanumanth Nagar', 'Hanumantha Nagar',\n", " 'Haralur Road', 'Harappanahalli', 'Harlur', 'Harsha Layout',\n", " 'Hebbal', 'Hebbal Kempapura', 'Hegde Nagar', 'Hegganahalli',\n", " 'Hennur', 'Hennur Bande', 'Hennur Gardens', 'Hennur 
Road',\n", " 'Herohalli', 'Hessarghatta', 'Himagiri Meadows', 'Hiremath Layout',\n", " 'Hongasandra', 'Hoodi', 'Hoodi Circle,', 'Hoodi Layout',\n", " 'Horamavu Agara', 'Horamavu Banaswadi', 'Hormavu', 'Hosa Road',\n", " 'Hosahalli Extension', 'Hosakerehalli', 'Hosakerehalli Layout',\n", " 'Hosapalya', 'Hoskote', 'Hosur Road', 'Hoysalanagar', 'Hulimavu',\n", " 'Huskur', 'ISRO Layout', 'ITI Layout', 'ITPL', 'Iblur Village',\n", " 'Immadihalli', 'Indira Nagar', 'Ittamadu', 'J C Nagar',\n", " 'JCR Layout', 'JP Nagar', 'JP Nagar 7th Phase,',\n", " 'JP Nagar 8th Phase,', 'Jai Bheema Nagar', 'Jakkasandra Extension',\n", " 'Jakkur', 'Jakkur Plantation', 'Jakkuru Layout', 'Jalahalli',\n", " 'Jalahalli East', 'Jalahalli West', 'Janatha Colony',\n", " 'Jaya Mahal layout', 'Jayamahal', 'Jayanagar', 'Jayanti Nagar',\n", " 'Jeevan bima nagar', 'Jigani', 'Jinkethimmanahalli',\n", " 'Jnana Ganga Nagar', 'Jnanabharathi Layout', 'Judicial Layout',\n", " 'Judicial Layout, Kanakapura Road,', 'Jyothi Nagar', 'KEB Colony',\n", " 'KR Garden', 'KR Layout', 'KR Puram', 'KSRTC Layout',\n", " 'KUDLU MAIN ROAD', 'Kachanayakanahalli', 'Kacharakanahalli',\n", " 'Kada Agrahara', 'Kadabagere', 'Kadubeesanahalli', 'Kadugodi',\n", " 'Kadugondanahalli', 'Kaggadasapura', 'Kaggalipura',\n", " 'Kaikondrahalli', 'Kalena Agrahara', 'Kalkere', 'Kallumantapa',\n", " 'Kalyan nagar', 'Kamakshipalya', 'Kamakya Layout', 'Kamala Nagar',\n", " 'Kambipura', 'Kammagondahalli', 'Kammanahalli', 'Kammasandra',\n", " 'Kanaka Nagar', 'Kanakapura', 'Kanakpura Road', 'Kannamangala',\n", " 'Kariyammana Agrahara', 'Karuna Nagar', 'Kasavanhalli',\n", " 'Kashi Nagar', 'Kasturi Nagar', 'Kathriguppe', 'Kattigenahalli',\n", " 'Kaval Byrasandra', 'Kaverappa Layout', 'Kaveri Nagar',\n", " 'Kempapura', 'Kempegowda Nagar', 'Kenchenahalli', 'Kenchenhalli',\n", " 'Kengeri', 'Kengeri Hobli', 'Kengeri Satellite Town',\n", " 'Kereguddadahalli', 'Keshava Nagar', 'Kirloskar Layout',\n", " 'Kithaganur', 'Kodathi', 'Kodbisanhalli', 
'Kodichikkanahalli',\n", " 'Kodigehaali', 'Kodigehalli', 'Kodihalli', 'Kodipalya', 'Kogilu',\n", " 'Konanakunte', 'Konanakunte Cross', 'Konena Agrahara',\n", " 'Koramangala', 'Koramangala Industrial Layout', 'Kothannur',\n", " 'Kothanur', 'Kothnoor Dinne', 'Krishna Reddy Layout', 'Kudlu',\n", " 'Kudlu Gate', 'Kullappa Colony', 'Kumara Park', 'Kumarapalli',\n", " 'Kumaraswami Layout', 'Kumbena Agrahara', 'Kundalahalli',\n", " 'Kundalahalli Colony', 'Kurubarahalli', 'Kuvempu Nagar',\n", " 'Kyalasanahalli', 'LB Shastri Nagar', 'Laggere', 'Lake City',\n", " 'Lakshmiamma Garden', 'Lakshminarayana Pura', 'Lakshmipura',\n", " 'Lal Bahadur Shastri Nagar', 'Langford Town', 'Lavakusha Nagar',\n", " 'Laxmi Sagar Layout', 'Lingadheeranahalli', 'Lingarajapuram',\n", " 'Lottegolla Halli', 'MCECHS layout', 'MLA Layout', 'MS Pallya',\n", " 'Madiwala', 'Magadi Road', 'Mahadevpura', 'Mahaganapathy Nagar',\n", " 'Mahalakshmi Layout', 'Mahalakshmi Puram', 'Maithri Layout',\n", " 'Makali', 'Mallasandra', 'Mallathahalli', 'Malleshpalya',\n", " 'Malleshwaram', 'Manayata Tech Park', 'Mangammanapalya',\n", " 'Manjunatha Layout', 'Manorayana Palya', 'Maragondanahalli',\n", " 'Marasandra', 'Marathahalli', 'Marenahalli', 'Margondanahalli',\n", " 'Mariyannapalya', 'Marsur', 'Maruthi Nagar', 'Maruthi Sevanagar',\n", " 'Mathikere', 'Mathikere Extension', 'Medahalli', 'Meenakunte',\n", " 'Mico Layout', 'Moodalapalya', 'Motappa Layout',\n", " 'Muneshwara Nagar', 'Munivenkatppa Layout', 'Munnekollal',\n", " 'Murugeshpalya', 'Muthurayya Swamy Layout', 'Mylasandra',\n", " 'Mysore Highway', 'Mysore Road', 'NGR Layout', 'NRI Layout',\n", " 'NS Palya', 'Nagadevanahalli', 'Naganathapura',\n", " 'Nagappa Reddy Layout', 'Nagaraja Garden', 'Nagarbhavi',\n", " 'Nagasandra', 'Nagashetty Halli', 'Nagavara', 'Nagavarapalya',\n", " 'Nagawara Junction', 'Nagondanahalli', 'Naidu Layout',\n", " 'Nallurhalli', 'Nandi Durga Road', 'Nandi Hills', 'Nandini Layout',\n", " 'Nanjappa Garden', 'Nanjappa Layout', 'Narayana 
Nagar 1st Block',\n", " 'Narayanapura', 'Nayandanahalli', 'Near International Airport',\n", " 'Neeladri Nagar', 'Neelamangala', 'Nehru Nagar', 'Nelamangala',\n", " 'New Gurappana Palya', 'New Thippasandra', 'Ngef Layout',\n", " 'Nobo Nagar', 'Nyanappana Halli', 'OLd Gurappanapalya',\n", " 'OMBR Layout', 'Off Sarjapur Road,', 'Old Airport Road',\n", " 'Old Madras Road', 'Omarbagh Layout', 'Omkar Nagar',\n", " 'Outer Ring Road East', 'P&T Layout', 'Padmanabhanagar',\n", " 'Pai Layout', 'Palace Road', 'Pampa Extension', 'Panathur',\n", " 'Panduranga Nagar', 'Parappana Agrahara', 'Patelappa Layout',\n", " 'Pattanagere', 'Pattandur Agrahara', 'Peenya',\n", " 'Phase 1 Kammasandra', 'Poorna Pragna Layout',\n", " 'Poornapragna Housing Society Layout', 'Pragathi Nagar',\n", " 'Prashanth Nagar', 'Prithvi Layout', 'Pulkeshi Nagar',\n", " 'Puttanahalli', 'R.T. Nagar', 'RMV', 'RMV 2nd Stage',\n", " 'RMV Extension', 'RMV Extension Stage 2', 'RPC layout',\n", " 'RR Layout', 'RWF West Colony', 'Rachenahalli',\n", " 'Raghavendra Layout', 'Raghavendra Nagar', 'Raghuvanahalli',\n", " 'Raja Rajeshwari Nagar', 'Raja Rajeshwari Nagar 5th Stage',\n", " 'Rajaji Nagar', 'Rajankunte', 'Rajarajeshwari Nagara',\n", " 'Rajarajeshwari nagar', 'Rajarajeshwarinagar', 'Rajasree Layout',\n", " 'Rajiv Gandhi Nagar', 'Rajiv Nagar', 'Ramagondanahalli',\n", " 'Ramakrishnappa Layout', 'Ramamurthy Nagar',\n", " 'Ramamurthy Nagar Extension', 'Ramanashree Enclave',\n", " 'Ramanjaneyanagar', 'Ramesh Nagar', 'Rayasandra',\n", " 'Reliaable Tranquil Layout', 'Remco Bhel Layout', 'Richards Town',\n", " 'Richmond Town', 'Roopena Agrahara', 'Rustam Bagh Layout',\n", " 'SRINIVASAPURA', 'Sadanand Nagar', 'Sadaramangala',\n", " 'Sahakara Nagar', 'Sai Gardens', 'Samethanahalli',\n", " 'Sampangi Rama Nagar', 'Sampigehalli', 'Sanjay nagar',\n", " 'Sanjeevini Nagar', 'Sanne Amanikere', 'Sarakki Nagar', 'Sarjapur',\n", " 'Sarjapur Road', 'Sarjapur Road,', 'Sarjapura - Attibele Road',\n", " 'Sarvabhouma Nagar', 'Sathya 
Layout', 'Sathya Sai Layout',\n", " 'Sector 1 HSR Layout', 'Sector 2 HSR Layout',\n", " 'Sector 6 HSR Layout', 'Sector 7 HSR Layout', 'Seegehalli',\n", " 'Seetharampalya', 'Seshadripuram', 'Shampura', 'Shankarapuram',\n", " 'Shanthi Layout', 'Shanti Nagar', 'Shantiniketan Layout',\n", " 'Shettigere', 'Shetty Halli', 'Shikaripalya', 'Shingapura',\n", " 'Shirdi Sai Layout', 'Shivaji Nagar', 'Shree Ananth Nagar Layout',\n", " 'Siddapura', 'Sidedahalli', 'Silk Board', 'Silver Springs Layout',\n", " 'Singanayakanahalli', 'Singapura Village', 'Singasandra',\n", " 'Singena Agrahara', 'Sneha Colony', 'Somasundara Palya', 'Sompura',\n", " 'Sonnenahalli', 'Soundarya Layout', 'Sri Balaji Krupa Layout',\n", " 'Sri Sai Layout', 'Sri Venkateshpura Layout', 'Srinagar',\n", " 'Srinivasa Nagar', 'Srirampura', 'Srirampuram', \"St. John's Road\",\n", " 'Stage-4 Bommanahalli', 'Subash Nagar', 'Subramanyapura',\n", " 'Suddaguntepalya', 'Sultan Palaya', 'Sunder Ram Shetty Nagar',\n", " 'Sunkadakatte', 'Surabhi Layout', 'Suraksha Nagar',\n", " 'Syndicate Bank Colony', 'T Dasarahalli', 'T.C PALYA', 'TC Palaya',\n", " 'Tala Cauvery Layout', 'Talaghattapura', 'Tasker Town',\n", " 'Tata Nagar', 'Tavarekere', 'Teachers Colony', 'Tejaswini Nagar',\n", " 'Telecom Layout', 'Thanisandra', 'Thanisandra Main Road,',\n", " 'Thigalarapalya', 'Thippasandra', 'Thirumenahalli', 'Thomas Town',\n", " 'Thubarahalli', 'Thyagaraja Nagar', 'Tigalarpalya', 'Tindlu',\n", " 'Tirumanahalli', 'Tumkur Road', 'Tunganagara', 'Udaya Nagar',\n", " 'Udayapur Village', 'Ullal Uppanagar', 'Ulsoor',\n", " 'Upadhyaya Layout', 'Upkar Layout', 'Uttarahalli', 'VGP Layout',\n", " 'VHBCS Layout', 'Vadarpalya', 'Vaderahalli', 'Vaishnavi Layout',\n", " 'Vajarahalli', 'Varanasi', 'Varsova Layout', 'Varthur',\n", " 'Varthur Road', 'Varthur Road,', 'Vasantha Vallabha Nagar',\n", " 'Vasanthapura', 'Veer Sandra', 'Veerannapalya', 'Veersandra',\n", " 'Venkatadri Layout', 'Venkatapura', 'Venugopal Reddy Layout',\n", " 'Vibuthipura', 
'Victoria Layout', 'Vidyaranyapura',\n", " 'Vignana Nagar', 'Vijaya Bank Layout', 'Vijayanagar', 'Vijinapura',\n", " 'Vimanapura', 'Vinayak Nagar', 'Vinayaka Nagar', 'Virat Nagar',\n", " 'Virupakshapura', 'Vishveshwarya Layout',\n", " 'Vishwanatha Nagenahalli', 'Vishwapriya Layout',\n", " 'Vishwapriya Nagar', 'Vittal Nagar', 'Vittasandra', 'Vivek Nagar',\n", " 'Volagerekallahalli', 'Weavers Colony', 'Whitefield',\n", " 'Whitefield,', 'Wilson Garden', 'Yarandahalli', 'Yelachenahalli',\n", " 'Yelahanka', 'Yelahanka New Town', 'Yelenahalli', 'Yemlur',\n", " 'Yeshwanthpur', 'Yeshwanthpur Industrial Suburb', 'cooketown',\n", " 'manyata park', 'tc.palya'], dtype=object)]" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ohe.categories_" ] },
{ "cell_type": "code", "execution_count": 46, "id": "4db98a15-351d-4aaa-b296-0177bb60cd94", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.00e+00, 0.00e+00, 0.00e+00, ..., 3.00e+00, 1.54e+03, 3.00e+00],\n", " [1.00e+00, 0.00e+00, 0.00e+00, ..., 1.00e+00, 6.00e+02, 1.00e+00],\n", " [1.00e+00, 0.00e+00, 0.00e+00, ..., 4.00e+00, 3.15e+03, 4.00e+00],\n", " ...,\n", " [0.00e+00, 0.00e+00, 0.00e+00, ..., 2.00e+00, 8.80e+02, 2.00e+00],\n", " [0.00e+00, 0.00e+00, 0.00e+00, ..., 2.00e+00, 1.00e+03, 2.00e+00],\n", " [0.00e+00, 0.00e+00, 0.00e+00, ..., 3.00e+00, 1.40e+03, 2.00e+00]])" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Feature matrix: the one-hot location columns (first dummy column dropped)\n",
 "# followed by every df2 column except 'location' and the target 'price'.\n",
 "x = np.append(\n",
 "    location_encoding[:, 1:],\n",
 "    np.array(df2.drop(['location', 'price'], axis=1)),\n",
 "    axis=1,\n",
 ")\n",
 "x"
] },
{ "cell_type": "code", "execution_count": null, "id": "1648f0c5-19ba-474f-8e4b-72fa066a6972", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": 47, "id": "44a3f1b9-c16e-4829-893c-598a042819e0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8764,)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Regression target.\n",
 "y = df2['price']\n",
 "y.shape"
] },
{ "cell_type": "code", "execution_count": 48, "id": "91524bc6-2a66-4543-a4d3-75b19c9c5a70", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8764, 754)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.shape" ] },
{ "cell_type": "code", "execution_count": 49, "id": "5f82b539-6aa7-4546-967a-40558898c55b", "metadata": {}, "outputs": [], "source": [
 "from sklearn.model_selection import train_test_split\n",
 "# Seed the 80/20 split so the hold-out score and the fitted `lr` are reproducible\n",
 "# on Restart & Run All (same seed as the ShuffleSplit used for model selection).\n",
 "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)"
] },
{ "cell_type": "code", "execution_count": 50, "id": "4a3e86aa-e91a-43f0-865a-889d0b4f3ae1", "metadata": {}, "outputs": [], "source": [
 "from sklearn.linear_model import LinearRegression\n",
 "lr = LinearRegression()"
] },
{ "cell_type": "code", "execution_count": 51, "id": "f313918d-6d8b-4868-8f3b-b793de473bdb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-1667207110599217.2" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "lr.fit(x_train, y_train)\n",
 "# R^2 on the held-out 20 %.\n",
 "lr.score(x_test, y_test)"
] },
{ "cell_type": "code", "execution_count": 52, "id": "636ea141-22a8-4338-89ea-da3f47b4c298", "metadata": {}, "outputs": [], "source": [
 "from sklearn.model_selection import GridSearchCV,cross_val_score,ShuffleSplit\n",
 "from sklearn.tree import DecisionTreeRegressor\n",
 "from sklearn.linear_model import Lasso"
] },
{ "cell_type": "code", "execution_count": 53, "id": "a4117694-4aa7-4773-ab21-34b4f3d1516b", "metadata": {}, "outputs": [], "source": [
 "# Candidate estimators and the hyper-parameter grid searched for each one.\n",
 "# NOTE: LinearRegression's `normalize` and DecisionTreeRegressor's criterion 'mse'\n",
 "# were deprecated in scikit-learn 1.0 and removed in 1.2; on newer releases drop\n",
 "# `normalize` (scale in a Pipeline instead) and use 'squared_error'.\n",
 "choices = {\n",
 "    'lr': {\n",
 "        'model': LinearRegression(),\n",
 "        'params': {\n",
 "            'normalize': [True, False]\n",
 "        }\n",
 "    },\n",
 "    'lasso': {\n",
 "        'model': Lasso(),\n",
 "        'params': {\n",
 "            'alpha': [1, 2],\n",
 "            'selection': ['cyclic', 'random']\n",
 "        }\n",
 "    },\n",
 "    'tree': {\n",
 "        'model': DecisionTreeRegressor(),\n",
 "        'params': {\n",
 "            'criterion': ['mse', 'friedman_mse'],\n",
 "            'splitter': ['best', 'random']\n",
 "        }\n",
 "    }\n",
 "}"
] },
{ "cell_type": "code", "execution_count": 54, "id": "41880a33-cc7b-48e1-b650-cbdb32ed9243", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelbest_scorebest_params
0lr-7.365020e+14{'normalize': False}
1lasso8.020881e-01{'alpha': 1, 'selection': 'random'}
2tree7.988632e-01{'criterion': 'mse', 'splitter': 'random'}
\n", "
" ], "text/plain": [ " model best_score best_params\n", "0 lr -7.365020e+14 {'normalize': False}\n", "1 lasso 8.020881e-01 {'alpha': 1, 'selection': 'random'}\n", "2 tree 7.988632e-01 {'criterion': 'mse', 'splitter': 'random'}" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "def best_model_param_pair(choices, features=None, target=None):\n",
 "    \"\"\"Grid-search every candidate in `choices` and tabulate each winner.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    choices : dict\n",
 "        Maps a short model name to {'model': estimator, 'params': param_grid}.\n",
 "    features, target : array-like, optional\n",
 "        Inputs and labels to search on; default to the notebook-level `x` and `y`.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    pandas.DataFrame\n",
 "        One row per candidate with columns 'model', 'best_score', 'best_params'.\n",
 "    \"\"\"\n",
 "    features = x if features is None else features\n",
 "    target = y if target is None else target\n",
 "    # Fixed seed: every candidate is scored on the same five 80/20 shuffles.\n",
 "    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)\n",
 "    data = []\n",
 "    for mn, mp in choices.items():\n",
 "        gs = GridSearchCV(mp['model'], mp['params'], cv=cv)\n",
 "        gs.fit(features, target)\n",
 "        data.append({\n",
 "            'model': mn,\n",
 "            'best_score': gs.best_score_,\n",
 "            'best_params': gs.best_params_,\n",
 "        })\n",
 "    return pd.DataFrame(data)\n",
 "\n",
 "daaa = best_model_param_pair(choices)\n",
 "daaa"
] },
{ "cell_type": "code", "execution_count": 55, "id": "cc5af773-698a-4e68-b974-5190b4dac88b", "metadata": {}, "outputs": [], "source": [
 "def predict(location, bhk, tsqft, bath, model=None):\n",
 "    \"\"\"Print the price predicted for a single listing.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    location : str\n",
 "        Location name, spelled as it appears in `ohe.categories_`.\n",
 "    bhk, tsqft, bath : number\n",
 "        Numeric features, appended after the location columns in this order.\n",
 "    model : fitted estimator, optional\n",
 "        Any object exposing `.predict`; defaults to the notebook-level `lr`.\n",
 "        NOTE(review): the stored outputs show `lr` with a hugely negative R^2\n",
 "        while Lasso reaches ~0.80 in cross-validation, so consider passing a\n",
 "        fitted Lasso here.\n",
 "    \"\"\"\n",
 "    estimator = lr if model is None else model\n",
 "    # Drop the first dummy column, mirroring `location_encoding[:, 1:]` used for `x`.\n",
 "    location_part = ohe.transform([[location]]).toarray()[:, 1:]\n",
 "    # np.append without `axis` flattens, so restore the (1, n_features) shape below.\n",
 "    row = np.append(location_part, np.array([bhk, tsqft, bath]))\n",
 "    print(estimator.predict(row.reshape(1, -1)))"
] },
{ "cell_type": "code", "execution_count": 56, "id": "fa5a025a-35df-4bed-853d-1aeabbae4583", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[94.38034082]\n" ] } ], "source": [ "predict('Devarabeesana Halli',2,1100.0,2.0)" ] },
{ "cell_type": "code", "execution_count": 57, "id": "6e3dca9e-dd82-4add-80ed-d4ef7b3a2f07", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", " df2[df2.location=='Devarabeesana Halli'][df2.total_sqft==1100]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathprice
2764Devarabeesana Halli21100.02.070.0
\n", "
" ], "text/plain": [ " location size total_sqft bath price\n", "2764 Devarabeesana Halli 2 1100.0 2.0 70.0" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Look up the actual listing to compare against the prediction above.\n",
 "# Both conditions go into one mask: chaining two boolean selections makes pandas\n",
 "# reindex the second mask and emit a UserWarning.\n",
 "df2[(df2.location == 'Devarabeesana Halli') & (df2.total_sqft == 1100)]"
] },
{ "cell_type": "code", "execution_count": 58, "id": "3214fdf2-6268-4c8f-9534-e9d106fec8f2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[84.99999962]\n" ] } ], "source": [ "predict('1st Block BEL Layout',3,1540.0,3.0)" ] },
{ "cell_type": "code", "execution_count": 59, "id": "c5fd9afa-fd72-41c9-8610-f8712d55af35", "metadata": {}, "outputs": [], "source": [
 "import pickle\n",
 "# Persist the fitted regressor.\n",
 "with open('banglore_price_prediction_model.pickle', 'wb') as f:\n",
 "    pickle.dump(lr, f)"
] },
{ "cell_type": "code", "execution_count": 60, "id": "40dc20e3-6872-4eec-80b3-dab5c140e445", "metadata": {}, "outputs": [], "source": [
 "# Persist the fitted location encoder next to the model.\n",
 "with open('location_encoder.pickle', 'wb') as encoder_file:\n",
 "    pickle.dump(ohe, encoder_file)"
] },
{ "cell_type": "code", "execution_count": 61, "id": "b2cf33b0-97d1-4a53-9972-9d26b4b6c3b7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'1st Block BEL Layout'" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Round-trip check of the saved encoder.\n",
 "# pickle.load can execute arbitrary code: only load files this notebook wrote itself.\n",
 "with open('location_encoder.pickle', 'rb') as lc:\n",
 "    le = pickle.load(lc)\n",
 "loc = le.categories_[0]\n",
 "loc[0]"
] },
{ "cell_type": "code", "execution_count": 65, "id": "79ba7e3d-9b96-4d22-a2f9-c79f46ba20e3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0.])" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "le.transform([['1st Block BEL Layout']]).toarray()[0][1:]" ] }, { "cell_type": "code", "execution_count": null, "id": "ad3fb7ef-6346-414e-9ddd-24d4743830a0", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }