Lab 1: Introduction to Python

Lab 1: Introduction to Python#

Loading data#

Boston data#

import pandas as pd
import numpy as np

# Plain-text copy of the Boston housing data.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regular expression, so it is written as a raw string:
# "\s" is not a valid escape in an ordinary string literal and triggers a
# warning on recent Python versions.
# skiprows=22 drops the leading lines of the file (presumably the descriptive
# header -- confirm against the file); header=None keeps every remaining line
# as data.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two consecutive lines: every column of the even lines plus
# the first two columns of the odd lines make up the feature matrix.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
# The third value of each odd line is kept separately as the target.
target = raw_df.values[1::2, 2]

data dimension#

# Show the size of the assembled feature matrix as a (rows, columns) tuple.
print(data.shape)

(506, 13)

subset of data#

# Every row of column index 1.
# NOTE(review): assuming this runs as a notebook cell, only the value of the
# last expression is echoed (the recorded output shows just the 2x2 slice),
# so the result of this line is discarded.
data[:,1]
# Rows 1-2 and columns 0-1 (slice end points are exclusive).
data[1:3,0:2]

array([[0.02731, 0.     ],
       [0.02729, 0.     ]])

Iris data#

from sklearn.datasets import load_iris
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()
# Size of the feature matrix as a (samples, features) tuple.
print(iris.data.shape)
# Names of the classes the integer targets refer to.
print(iris.target_names)

(150, 4)
['setosa' 'versicolor' 'virginica']

The first two features#

# Keep only the first two feature columns; going by the axis labels used in
# the plot below, these are sepal length and sepal width.
X = iris.data[:, :2]
# Integer class label of every sample, used below to colour the points.
y = iris.target

Plot the first two features#

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

# Work on figure number 2 (8 x 6 inches), wiped of any earlier content.
plt.figure(2, figsize=(8, 6))
plt.clf()

# One point per sample: first feature against second feature, coloured by
# class label and outlined in black.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

# Axis limits: the data range of each feature padded by half a unit per side.
x_min = X[:, 0].min() - .5
x_max = X[:, 0].max() + .5
y_min = X[:, 1].min() - .5
y_max = X[:, 1].max() + .5
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

# Empty tick sequences remove the tick marks from both axes.
plt.xticks(())
plt.yticks(())

plt.show()

_images/e4306c9da1b033b89fa9a127f45111d334531e580cbdd4399cee2e6131d18180.png

Plot the first three PCA dimensions#

# Project the full iris feature matrix onto its first three principal
# components.
X_reduced = PCA(n_components=3).fit_transform(iris.data)

# A fresh figure holding a single 3D subplot.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# One point per sample, placed by its three component scores and coloured by
# class label.
first_pc = X_reduced[:, 0]
second_pc = X_reduced[:, 1]
third_pc = X_reduced[:, 2]
ax.scatter(first_pc, second_pc, third_pc,
           c=y, cmap=plt.cm.Set1, edgecolor='k', s=40)

# Title and axis captions.
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
ax.set_zlabel("3rd eigenvector")

plt.show()

_images/8c6726e220e575448b9cf0539947b9eb03f03dab374392a368433ab7066eba56.png

Digit data#

from sklearn.datasets import load_digits
# Load the handwritten-digits dataset that ships with scikit-learn.
digits = load_digits()
# Size of the flattened feature matrix as a (samples, features) tuple.
print(digits.data.shape)
# Label of every sample.
print(digits.target)

(1797, 64)
[0 1 2 ... 8 9 8]

Plot an image#

import matplotlib.pyplot as plt 
# Display sample 17 as an image in grey levels.
# The colormap is passed to matshow directly instead of calling plt.gray()
# first: with no figure open, plt.gray() creates an extra, empty figure, and
# it also changes the default colormap for every image drawn afterwards.
plt.matshow(digits.images[17], cmap="gray")
plt.show()

<Figure size 640x480 with 0 Axes>

_images/a54763a9ca7ee72af9642b2f0419e8ac80f766c6e34fefb62751aeba4fb623dc.png

Simulating data#

Generate random numbers [0,1]#

from random import seed
from random import random

# Seed once so that the ten draws below repeat exactly from run to run.
seed(14)
# Produce ten draws from random() lazily, printing each one as it is made.
for value in (random() for _ in range(10)):
    print(value)

0.10682853770165568
0.7025855239868555
0.6520420203142754
0.9403523895661179
0.27111522656032316
0.25577551343303917
0.7340593641446967
0.6584500182400758
0.3029879738883551
0.6842331280769555

Generate random integers#

from random import seed
from random import randint
# Fix the seed so the same ten integers come out on every run.
seed(1)
# Draw ten integers between 0 and 10 (both ends included), then print them
# one per line in the order they were drawn.
draws = [randint(0, 10) for _ in range(10)]
for value in draws:
    print(value)

Generating a random sample without replacement#

# select a random sample without replacement
from random import seed
from random import sample
# Seed the generator so the selected subset is the same on every run.
seed(1)
# The population to draw from: the integers 0 through 19.
sequence = list(range(20))
print(sequence)
# Pick five distinct elements; sample() never returns the same position twice.
subset = sample(sequence, 5)
print(subset)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[4, 18, 2, 8, 3]

Generating random numbers from distributions#

import random

# Seed the standard-library generator so every draw below is reproducible.
random.seed(1)

# Ten draws from a normal distribution with mean 0 and standard deviation 1.
print("Normal distribution")
for _ in range(10):
    value = random.gauss(0, 1)
    print(value)

# Ten draws from a uniform distribution between 0 and 1.
print("\nUniform")
for _ in range(10):
    value = random.uniform(0, 1)
    print(value)

# Ten draws from an exponential distribution with rate 10 (mean 1/10).
print("\nExponential")
for _ in range(10):
    value = random.expovariate(10)
    print(value)

# Ten draws from a Gamma distribution with shape 1 and scale 10, collected
# into one list and printed together. Built directly with a comprehension
# rather than by overwriting a placeholder list index by index.
print("\nGamma")
value = [random.gammavariate(1, 10) for _ in range(10)]
print(value)

# generate multivariate normal
print("\nMultivariate normal")
import numpy as np
import matplotlib.pyplot as plt 
from scipy.stats import multivariate_normal

# Draw all ten two-dimensional samples in a single call; the result has one
# row per sample and one column per coordinate.
# SciPy draws from NumPy's random generator, which the standard-library
# random.seed() call does not affect, so an explicit random_state is passed
# to make this output reproducible as well.
rmvn = multivariate_normal.rvs(
    mean=[0.5, -0.2],
    cov=[[2.0, 0.3], [0.3, 0.5]],
    size=10,
    random_state=1,
)
print(rmvn)

# Scatter the samples; each marker's area grows with the squared distance of
# the point from the origin.
plt.scatter(rmvn[:, 0], rmvn[:, 1], s=30 * (rmvn[:, 0]**2 + rmvn[:, 1]**2), c="red", alpha=0.5)

Generate 2D classification points#

from sklearn.datasets import make_blobs
from matplotlib import pyplot
from pandas import DataFrame
# Generate 100 two-feature points spread around three cluster centres.
X, y = make_blobs(n_samples=100, centers=3, n_features=2)
# One row per point: its two coordinates and the cluster it belongs to.
df = DataFrame({'x': X[:, 0], 'y': X[:, 1], 'label': y})
# Marker colour for each cluster label.
colors = {
    0: 'red',
    1: 'blue',
    2: 'green',
}
fig, ax = pyplot.subplots()
grouped = df.groupby('label')
# Draw each cluster as its own scatter layer on the shared axes so that it
# gets its own colour and legend entry.
for key, group in grouped:
    group.plot(
        kind='scatter',
        x='x',
        y='y',
        ax=ax,
        label=key,
        color=colors[key],
    )
pyplot.show()

Generating circle data for classification#

from sklearn.datasets import make_circles
from matplotlib import pyplot
from pandas import DataFrame
# Generate 100 noisy points (noise=0.05) with scikit-learn's two-circle
# generator; y holds the label (0 or 1) of each point.
X, y = make_circles(n_samples=100, noise=0.05)
# Tabulate coordinates and label so the points can be grouped by class.
df = DataFrame({'x': X[:, 0], 'y': X[:, 1], 'label': y})
colors = {0: 'red', 1: 'blue'}
fig, ax = pyplot.subplots()
grouped = df.groupby('label')
# One scatter layer per class on the same axes, each in its own colour.
for key, group in grouped:
    point_style = dict(kind='scatter', x='x', y='y', label=key, color=colors[key])
    group.plot(ax=ax, **point_style)
pyplot.show()

Lab 1: Introduction to Python

Contents

Lab 1: Introduction to Python#

Loading data#

Boston data#

data dimension#

subset of data#

Iris data#

The first two features#

Plot the first two features#

Plot the first three PCA dimensions#

Digit data#

Plot an image#

Simulating data#

Generate random numbers [0,1]#

Generate random integers#

Generating a random sample without replacement#

Generating random numbers from distributions#

Generate 2D classification points#

Generating circle data for classification#