import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import seaborn as sn
import matplotlib.pyplot as plt

from scipy import stats
import warnings
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Load the training split and tag every row with its origin so the rows
# remain identifiable while the helper column is present.
train_df = pd.read_csv('bike_train.csv').assign(data_set='train')
train_df.head(5)


# Turn the calendar date plus the hour-of-day column into one hourly timestamp.
# NOTE(review): the test file shows day-first dates (e.g. 20/05/2012); confirm
# the training file's format and whether dayfirst=True is needed here.
parsed_dates = pd.to_datetime(train_df["dteday"])
hour_offsets = pd.to_timedelta(train_df['hr'], unit='h')
train_df["dteday"] = parsed_dates + hour_offsets
train_df.head(5)
train_df.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
data_set      0
dtype: int64


train_df.dtypes

instant                int64
dteday        datetime64[ns]
season                 int64
yr                     int64
mnth                   int64
hr                     int64
holiday                int64
weekday                int64
workingday             int64
weathersit             int64
temp                 float64
atemp                float64
hum                  float64
windspeed            float64
casual                 int64
registered             int64
cnt                    int64
data_set              object
dtype: object


print(train_df.head(5))

# Remove the origin tag and the running row id in a single call.
train_df = train_df.drop(columns=['data_set', 'instant'])

   instant              dteday  season  yr  mnth  hr  holiday  weekday  \
0        1 2011-01-01 00:00:00       1   0     1   0        0        6   
1        2 2011-01-01 01:00:00       1   0     1   1        0        6   
2        3 2011-01-01 02:00:00       1   0     1   2        0        6   
3        4 2011-01-01 03:00:00       1   0     1   3        0        6   
4        5 2011-01-01 04:00:00       1   0     1   4        0        6   

   workingday  weathersit  temp   atemp   hum  windspeed  casual  registered  \
0           0           1  0.24  0.2879  0.81        0.0       3          13   
1           0           1  0.22  0.2727  0.80        0.0       8          32   
2           0           1  0.22  0.2727  0.80        0.0       5          27   
3           0           1  0.24  0.2879  0.75        0.0       3          10   
4           0           1  0.24  0.2879  0.75        0.0       0           1   

   cnt data_set  
0   16    train  
1   40    train  
2   32    train  
3   13    train  
4    1    train


# Columns that hold discrete codes rather than measurements.
categoryVariableList = ["hr","weekday","mnth","season","weathersit","holiday","workingday"]
# Convert all of them to pandas' categorical dtype in one astype call.
train_df = train_df.astype({column: "category" for column in categoryVariableList})


# Copy of the training frame without the timestamp column.
dailyData = train_df.drop(columns=["dteday"])


# 2x2 grid of box plots of the hourly count: overall, by season, by hour of
# day and by working-day flag.
fig, axes = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(12, 10)
sn.boxplot(data=train_df,y="cnt",orient="v",ax=axes[0][0])
sn.boxplot(data=train_df,y="cnt",x="season",orient="v",ax=axes[0][1])
sn.boxplot(data=train_df,y="cnt",x="hr",orient="v",ax=axes[1][0])
sn.boxplot(data=train_df,y="cnt",x="workingday",orient="v",ax=axes[1][1])

# matplotlib does not decode HTML entities, so accented characters are written
# directly instead of as "&iacute;" (which would be drawn literally).
axes[0][0].set(ylabel='Contados',title="Diagrama de caja de Bicicletas contadas registradas o no registradas")
axes[0][1].set(xlabel='Estaciones', ylabel='Contados',title=" Diagrama de cajas a lo largo de la temporada")
axes[1][0].set(xlabel='Horas del día', ylabel='Contados',title="Diagrama de cajas a lo largo del día")
axes[1][1].set(xlabel='Días laborables', ylabel='Contados',title="Diagrama de cajas a lo largo de días laborales")

[Text(0.5, 0, 'Días laborables'),
 Text(0, 0.5, 'Contados'),
 Text(0.5, 1.0, 'Diagrama de cajas a lo largo de días laborales')]


# Keep only the rows whose count lies within three standard deviations of the
# mean count.
cnt_deviation = np.abs(train_df["cnt"] - train_df["cnt"].mean())
train_dfWithoutOutliers = train_df[cnt_deviation <= 3 * train_df["cnt"].std()]


# Report the frame shape before and after filtering (the accented character is
# written directly; the previous literal was mis-encoded).
print ("Forma de los outliers en un inicio: ",train_df.shape)
print ("Forma de los outliers después: ",train_dfWithoutOutliers.shape)

Forma de los outliers en un inicio:  (11999, 16)
Forma de los outliers después:  (11846, 16)


# Pairwise correlations between the numeric weather columns and the counts.
corrMatt = train_df[["temp","atemp","casual","registered","hum","windspeed","cnt"]].corr()

# Hide the strict upper triangle (it mirrors the lower one). An explicit
# boolean mask is used instead of reusing the correlation values as the mask,
# so a cell is hidden because of its position, not because its value happens
# to be non-zero.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)

fig,ax= plt.subplots()
fig.set_size_inches(20,10)
# Draw on the axes created above rather than relying on the implicit current axes.
sn.heatmap(corrMatt, mask=mask,vmax=.8, square=True,annot=True, ax=ax)

<AxesSubplot:>


# One scatter-plus-regression panel per weather measurement against the count.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
fig.set_size_inches(12, 5)
for feature, panel in zip(("temp", "windspeed", "hum"), (ax1, ax2, ax3)):
    sn.regplot(x=feature, y="cnt", data=train_df, ax=panel)

<AxesSubplot:xlabel='hum', ylabel='cnt'>


# Column groups used when preparing model inputs.
categoricalFeatureNames = ["season","holiday","workingday","weathersit","weekday","mnth","yr","hr"]
numericalFeatureNames = ["temp","hum","windspeed","atemp"]
# Columns to exclude from the features. The timestamp column in this data is
# named "dteday"; the previous entry "dtetime" matched no column.
dropFeatures = ['casual',"cnt","dteday","registered"]


#from sklearn.ensemble import RandomForestRegressor
#rfModel = RandomForestRegressor(n_estimators=25)
#yLabelsLog = np.log1p(yLabels)
#print(yLabelsLog.shape)
#print(dataTrain.shape)
#rfModel.fit(dataTrain,yLabelsLog)
#preds = rfModel.predict(X= dataTrain)
#print ("RMSLE Value For Random Forest: ",rmsle(np.exp(yLabelsLog),np.exp(preds),False))


print(train_df.head(5))

               dteday season  yr mnth hr holiday weekday workingday  \
0 2011-01-01 00:00:00      1   0    1  0       0       6          0   
1 2011-01-01 01:00:00      1   0    1  1       0       6          0   
2 2011-01-01 02:00:00      1   0    1  2       0       6          0   
3 2011-01-01 03:00:00      1   0    1  3       0       6          0   
4 2011-01-01 04:00:00      1   0    1  4       0       6          0   

  weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  
0          1  0.24  0.2879  0.81        0.0       3          13   16  
1          1  0.22  0.2727  0.80        0.0       8          32   40  
2          1  0.22  0.2727  0.80        0.0       5          27   32  
3          1  0.24  0.2879  0.75        0.0       3          10   13  
4          1  0.24  0.2879  0.75        0.0       0           1    1


# Load the held-out split. Unlike the training frame, nothing below parses
# its dteday column or casts its code columns to category.
test_df = pd.read_csv('bike_test.csv')
test_df.head()


# Preview again, then count missing values per column.
test_df.head()
test_df.isnull().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
dtype: int64


print(test_df.head(5))

   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  \
0    12000  20/05/2012       2   1     5   4        0        0           0   
1    12001  20/05/2012       2   1     5   5        0        0           0   
2    12002  20/05/2012       2   1     5   6        0        0           0   
3    12003  20/05/2012       2   1     5   7        0        0           0   
4    12004  20/05/2012       2   1     5   8        0        0           0   

   weathersit  temp   atemp   hum  windspeed  
0           1  0.52  0.5000  0.68     0.0896  
1           1  0.50  0.4848  0.72     0.1045  
2           1  0.50  0.4848  0.63     0.1343  
3           1  0.52  0.5000  0.68     0.1940  
4           1  0.56  0.5303  0.56     0.1642


print(test_df.head(5))

   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  \
0    12000  20/05/2012       2   1     5   4        0        0           0   
1    12001  20/05/2012       2   1     5   5        0        0           0   
2    12002  20/05/2012       2   1     5   6        0        0           0   
3    12003  20/05/2012       2   1     5   7        0        0           0   
4    12004  20/05/2012       2   1     5   8        0        0           0   

   weathersit  temp   atemp   hum  windspeed  
0           1  0.52  0.5000  0.68     0.0896  
1           1  0.50  0.4848  0.72     0.1045  
2           1  0.50  0.4848  0.63     0.1343  
3           1  0.52  0.5000  0.68     0.1940  
4           1  0.56  0.5303  0.56     0.1642


# Total rentals per weekday code, first for casual and then for registered riders.
byday = train_df.groupby('weekday')
byday[['casual']].sum().reset_index()


byday[['registered']].sum().reset_index()


 #create binary features which show if day is Saturday/Sunday
# The weekday code in this data starts at 0 = Sunday: the 2011-01-01 rows
# (a Saturday) carry weekday 6 and the 20/05/2012 test rows (a Sunday) carry 0.
# The flags are therefore derived from codes 6 and 0, not 5 and 6.
# They are built with a vectorised comparison and a direct column assignment,
# which avoids chained assignment (train_df.col[mask] = 1).
train_df['Saturday'] = (train_df['weekday'] == 6).astype('int64')

train_df['Sunday'] = (train_df['weekday'] == 0).astype('int64')
print(train_df.head(5))

               dteday season  yr mnth hr holiday weekday workingday  \
0 2011-01-01 00:00:00      1   0    1  0       0       6          0   
1 2011-01-01 01:00:00      1   0    1  1       0       6          0   
2 2011-01-01 02:00:00      1   0    1  2       0       6          0   
3 2011-01-01 03:00:00      1   0    1  3       0       6          0   
4 2011-01-01 04:00:00      1   0    1  4       0       6          0   

  weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  \
0          1  0.24  0.2879  0.81        0.0       3          13   16   
1          1  0.22  0.2727  0.80        0.0       8          32   40   
2          1  0.22  0.2727  0.80        0.0       5          27   32   
3          1  0.24  0.2879  0.75        0.0       3          10   13   
4          1  0.24  0.2879  0.75        0.0       0           1    1   

   Saturday  Sunday  
0         0       1  
1         0       1  
2         0       1  
3         0       1  
4         0       1


# Training frame without the timestamp and the total count.
dataRel = train_df.drop(columns=['dteday', 'cnt'])


#print(train_df.head(4))
print(train_df['temp'])

0        0.24
1        0.22
2        0.22
3        0.24
4        0.24
         ... 
11994    0.60
11995    0.58
11996    0.56
11997    0.56
11998    0.54
Name: temp, Length: 11999, dtype: float64


# Also look for NaN values here: count the missing entries in each column of
# the test frame.

test_df.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
dtype: int64


print(test_df.head(5))

   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  \
0    12000  20/05/2012       2   1     5   4        0        0           0   
1    12001  20/05/2012       2   1     5   5        0        0           0   
2    12002  20/05/2012       2   1     5   6        0        0           0   
3    12003  20/05/2012       2   1     5   7        0        0           0   
4    12004  20/05/2012       2   1     5   8        0        0           0   

   weathersit  temp   atemp   hum  windspeed  
0           1  0.52  0.5000  0.68     0.0896  
1           1  0.50  0.4848  0.72     0.1045  
2           1  0.50  0.4848  0.63     0.1343  
3           1  0.52  0.5000  0.68     0.1940  
4           1  0.56  0.5303  0.56     0.1642


# One 10-bin histogram per column, each in its own figure.
# Every entry is (column, x-axis label, title, bar styling).
# Accented characters are written directly: matplotlib draws HTML entities
# such as "&oacute;" literally. Title typos are corrected as well.
histogram_specs = [
    ('temp', "Temperatura", 'Distribución de la temperatura',
     {'edgecolor': 'yellow'}),
    ('atemp', "grados Celsius", 'Sensación térmica en grados Celsius',
     {'color': 'black', 'edgecolor': 'green'}),
    ('hum', "Humedad", 'Distribución de la Humedad',
     {'color': 'orange', 'edgecolor': 'green'}),
    ('windspeed', "Velocidad del viento", 'Distribución de la velocidad del viento',
     {'color': 'green', 'edgecolor': 'pink'}),
    ('workingday', "Días laborables", 'Distribución de días laborables',
     {'color': 'pink', 'edgecolor': 'green'}),
    ('holiday', "Días feriados", 'Distribución de días feriados',
     {'color': 'grey', 'edgecolor': 'green'}),
    ('season', "Estación", 'Estaciones en el estado de Washington',
     {'color': 'purple', 'edgecolor': 'green'}),
    ('weathersit', "Clima", 'Clima en Washington D.C.',
     {'color': 'magenta', 'edgecolor': 'green'}),
]

for column, x_label, title, bar_style in histogram_specs:
    fig, ax = plt.subplots()
    # The former label="Rowing" argument was unrelated to this data and no
    # legend is drawn, so it is omitted.
    ax.hist(train_df[column], bins=10, **bar_style)
    ax.set_xlabel(x_label)
    ax.set_ylabel("valores")
    ax.set_title(title)
    # Applied to every figure so all eight share the same layout treatment.
    fig.tight_layout()


train_df['cnt'].plot.hist(bins=20, figsize=(12,8))

<AxesSubplot:ylabel='Frequency'>


print(train_df['temp'])

0        0.24
1        0.22
2        0.22
3        0.24
4        0.24
         ... 
11994    0.60
11995    0.58
11996    0.56
11997    0.56
11998    0.54
Name: temp, Length: 11999, dtype: float64


# Inputs and target for a forest that estimates temp from the other readings.
temp_model_features = ['windspeed', 'hum', 'workingday', 'weathersit', 'hr']
X_train = train_df[temp_model_features]
print(X_train)
y_train = train_df['temp']

# 25 trees with a fixed seed so repeated runs give the same forest.
rf = RandomForestRegressor(random_state=5, n_estimators=25)

# Despite the name, no grid search is run here: grid_rf simply holds whatever
# rf.fit returns, and that is what gets printed.
grid_rf = rf.fit(X_train, y_train)
print(grid_rf)

       windspeed   hum workingday weathersit  hr
0         0.0000  0.81          0          1   0
1         0.0000  0.80          0          1   1
2         0.0000  0.80          0          1   2
3         0.0000  0.75          0          1   3
4         0.0000  0.75          0          1   4
...          ...   ...        ...        ...  ..
11994     0.1642  0.56          0          1  23
11995     0.1045  0.53          0          1   0
11996     0.0000  0.52          0          1   1
11997     0.0000  0.52          0          1   2
11998     0.0896  0.56          0          1   3

[11999 rows x 5 columns]
RandomForestRegressor(n_estimators=25, random_state=5)


# Pair each fitted feature importance with its column name.
importances = pd.Series(data=rf.feature_importances_,
                        index=X_train.columns)

# Ascending order, so the most important feature ends up at the top of the
# horizontal bar chart.
importances_sorted = importances.sort_values()

importances_sorted.plot(kind='barh', color='purple')
# The accented character is written directly; the previous literal was mis-encoded.
plt.title('Características Importantes para considerarse')
plt.show()


# Same input columns for both splits; windspeed becomes the target.
wind_model_features = ['hum', 'workingday', 'weathersit', 'hr', 'temp']
X_train = train_df[wind_model_features]
print(X_train)
X_test = test_df[wind_model_features]

y_train, y_test = train_df['windspeed'], test_df['windspeed']

        hum workingday weathersit  hr  temp
0      0.81          0          1   0  0.24
1      0.80          0          1   1  0.22
2      0.80          0          1   2  0.22
3      0.75          0          1   3  0.24
4      0.75          0          1   4  0.24
...     ...        ...        ...  ..   ...
11994  0.56          0          1  23  0.60
11995  0.53          0          1   0  0.58
11996  0.52          0          1   1  0.56
11997  0.52          0          1   2  0.56
11998  0.56          0          1   3  0.54

[11999 rows x 5 columns]


from sklearn.metrics import mean_squared_error
from math import sqrt

# rf was last fitted on a different feature set (to estimate temp). Refit it
# on the current X_train / y_train so that the columns and target it learned
# from match the X_test / y_test it is scored against.
rf.fit(X_train, y_train)

# Predict the test set labels
y_pred = rf.predict(X_test)

# Evaluate the test set RMSE
rmse_test = sqrt(mean_squared_error(y_test, y_pred))

# Print rmse_test
print('Prueba de RMSE rf: {:.2f}'.format(rmse_test))

Prueba de RMSE rf: 0.13


from sklearn.ensemble import RandomForestRegressor

# Fit a forest with default settings on the current training split.
rf = RandomForestRegressor().fit(X_train, y_train)

# Predict with the newly fitted model; without this the RMSE below would be
# computed from predictions made before this model existed.
y_pred = rf.predict(X_test)

print(np.sqrt(mean_squared_error(y_test, y_pred)))

0.12557948913384465

	weekday	casual
0	0	83768
1	1	41792
2	2	34122
3	3	30481
4	4	34358
5	5	45278
6	6	90769

	weekday	registered
0	0	174946
1	1	221902
2	2	242865
3	3	232363
4	4	243990
5	5	239590
6	6	186757

	instant	dteday	season	mnth	hr	weekday	weathersit	temp	atemp	hum	casual	registered	cnt	data_set
0	1	1/1/2011	1	1	0	6	1	0.24	0.2879	0.81	3	13	16	train
1	2	1/1/2011	1	1	1	6	1	0.22	0.2727	0.80	8	32	40	train
2	3	1/1/2011	1	1	2	6	1	0.22	0.2727	0.80	5	27	32	train
3	4	1/1/2011	1	1	3	6	1	0.24	0.2879	0.75	3	10	13	train
4	5	1/1/2011	1	1	4	6	1	0.24	0.2879	0.75	0	1	1	train

	instant	dteday	season	yr	mnth	hr	weathersit	temp	atemp	hum	windspeed
0	12000	20/05/2012	2	1	5	4	1	0.52	0.5000	0.68	0.0896
1	12001	20/05/2012	2	1	5	5	1	0.50	0.4848	0.72	0.1045
2	12002	20/05/2012	2	1	5	6	1	0.50	0.4848	0.63	0.1343
3	12003	20/05/2012	2	1	5	7	1	0.52	0.5000	0.68	0.1940
4	12004	20/05/2012	2	1	5	8	1	0.56	0.5303	0.56	0.1642