In [1]:
import os
import pandas as pd
import numpy as np
import random
from numpy import mean, std
import matplotlib.pylab as plt
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from skimage.io import imread
from skimage.transform import resize
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
In [2]:
## convert to grayscale function
def rgb2gray(rgb):
return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140])
In [3]:
#imagedir = r'C:\Users\morga\OneDrive\Documents\UW 2023 - 2024\INFO 371\combined-images'
imagedir = '/Users/renee/Downloads/combined' #comment out
#imagedir = 'add your directory here'
In [4]:
## visualize grayscale
# list of image file paths
image_paths = [os.path.join(imagedir, filename) for filename in os.listdir(imagedir) if filename.endswith(".jpg")]
#real_image_path = r'C:\Users\morga\OneDrive\Documents\UW 2023 - 2024\INFO 371\combined-images\real_00130.jpg'
#fake_image_path = r'C:\Users\morga\OneDrive\Documents\UW 2023 - 2024\INFO 371\combined-images\mid_108_1110.jpg'
real_image_path = '/Users/renee/Downloads/combined/real_00130.jpg' #comment out
fake_image_path = '/Users/renee/Downloads/combined/mid_108_1110.jpg' #comment out
#real_image_path = 'add your directory here/real_00130.jpg'
#fake_image_path = 'add your directory here/mid_108_1110.jpg'
real_original_image = imread(real_image_path)
fake_original_image = imread(fake_image_path)
real_gray_image = rgb2gray(real_original_image)
fake_gray_image = rgb2gray(fake_original_image)
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes[0, 0].imshow(real_original_image)
axes[0, 0].set_title('Real Original Image')
axes[0, 1].imshow(real_gray_image, cmap='gray')
axes[0, 1].set_title('Real Grayscale Image')
axes[1, 0].imshow(fake_original_image)
axes[1, 0].set_title('Fake Original Image')
axes[1, 1].imshow(fake_gray_image, cmap='gray')
axes[1, 1].set_title('Fake Grayscale Image')
plt.tight_layout()
plt.show()
In [5]:
# Dataset 1: All datapoints included
x_real, y_real = [], []
x_fake, y_fake = [], [] #
for subdir, _, files in os.walk(imagedir):
for filename in files:
if filename.endswith(".jpg"):
image = imread(os.path.join(subdir, filename))
gray_image = rgb2gray(image)
gray_image_resized = resize(gray_image, (100, 100))
flattened_image = gray_image_resized.flatten()
# Assigning label 0 for fake images and label 1 for real images based on file naming convention
if 'real' in filename.lower(): # Label real images
x_real.append(flattened_image)
y_real.append(1) # Label as real
else: # Label fake images based on difficulty level
if 'easy' in filename.lower():
x_fake.append(flattened_image)
y_fake.append(0) # Label as fake
elif 'mid' in filename.lower():
x_fake.append(flattened_image)
y_fake.append(0) # Label as fake
elif 'hard' in filename.lower():
x_fake.append(flattened_image)
y_fake.append(0) # Label as fake
# Convert lists to numpy arrays
x_real, y_real = np.array(x_real), np.array(y_real)
x_fake, y_fake = np.array(x_fake), np.array(y_fake)
# Combine real and fake data and labels
x_all = np.concatenate((x_real, x_fake))
y_all = np.concatenate((y_real, y_fake))
# Perform stratified sampling for 'all' dataset
x_all_train, x_all_test, y_all_train, y_all_test = train_test_split(
x_all, y_all, test_size=0.2, shuffle=True, random_state=42, stratify=y_all)
In [6]:
# PCA testing for combined dataset
pca = PCA(n_components=200, svd_solver='randomized')
pca.fit(np.vstack((x_all_train, x_all_test)))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative Explained Variance');
In [7]:
# Eigenfaces for combined dataset
pca = PCA(n_components=60, svd_solver='randomized')
pca.fit(x_all)
fig, axes = plt.subplots(10, 6, figsize=(20, 10),
subplot_kw={'xticks':[], 'yticks':[]},
gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
ax.imshow(pca.components_[i].reshape(100, 100), cmap='bone')
plt.show()
In [8]:
# Setting PCA
pca = PCA(n_components=200, svd_solver='randomized')
pca.fit(np.vstack((x_all_train, x_all_test)))
x_all_train_pca = pca.transform(x_all_train)
x_all_test_pca = pca.transform(x_all_test)
In [9]:
# SVC all
train_accuracies_svc_all = []
test_accuracies_svc_all = []
for i in range(3):
m_svc_all = SVC()
parameters_svc_all = [{'gamma': [0.01, 0.001, 0.0001], 'C': [1, 10, 100, 1000]}]
grid_search_svc_all = GridSearchCV(m_svc_all, parameters_svc_all, cv=2)
grid_search_svc_all.fit(x_all_train_pca, y_all_train)
best_estimator_svc_all = grid_search_svc_all.best_estimator_
train_pred_svc_all = best_estimator_svc_all.predict(x_all_train_pca)
test_pred_svc_all = best_estimator_svc_all.predict(x_all_test_pca)
train_accuracy_svc_all = accuracy_score(y_all_train, train_pred_svc_all)
test_accuracy_svc_all = accuracy_score(y_all_test, test_pred_svc_all)
train_accuracies_svc_all.append(train_accuracy_svc_all)
test_accuracies_svc_all.append(test_accuracy_svc_all)
print('Training Accuracy:', mean(train_accuracies_svc_all))
print('Testing Accuracy:', mean(test_accuracies_svc_all))
Training Accuracy: 1.0 Testing Accuracy: 0.530562347188264
In [10]:
# Multi-Layer Perceptron all
train_accuracies_mlp_all = []
test_accuracies_mlp_all = []
for i in range(3):
mlp_all = MLPClassifier()
mlp_parameters_all = [{'activation': ['logistic', 'relu'],
'learning_rate': ['constant', 'adaptive'],
'learning_rate_init': [0.001, 0.01, 0.1],
'max_iter': [800, 1000, 1200, 1400]}]
mlp_grid_search_all = GridSearchCV(mlp_all, mlp_parameters_all, cv=2)
mlp_grid_search_all.fit(x_all_train_pca, y_all_train)
best_estimator_mlp_all = mlp_grid_search_all.best_estimator_
train_pred_mlp_all = best_estimator_mlp_all.predict(x_all_train_pca)
test_pred_mlp_all = best_estimator_mlp_all.predict(x_all_test_pca)
train_accuracy_mlp_all = accuracy_score(y_all_train, train_pred_mlp_all)
test_accuracy_mlp_all = accuracy_score(y_all_test, test_pred_mlp_all)
train_accuracies_mlp_all.append(train_accuracy_mlp_all)
test_accuracies_mlp_all.append(test_accuracy_mlp_all)
print('Training Accuracy:', mean(train_accuracies_mlp_all))
print('Testing Accuracy:', mean(test_accuracies_mlp_all))
Training Accuracy: 0.8845996732026143 Testing Accuracy: 0.5566422167889161
In [11]:
# Data for visualization
model_names = ['SVC with PCA', 'MLPClassifier with PCA']
train_accuracy_scores = [mean(train_accuracies_svc_all), mean(train_accuracies_mlp_all)]
test_accuracy_scores = [mean(test_accuracies_svc_all), mean(test_accuracies_mlp_all)]
# Creating bar plots for accuracy
fig, axs = plt.subplots(2, figsize=(5, 8))
# Training Accuracy for Combined Dataset
axs[0].bar(model_names, train_accuracy_scores, color=['blue', 'green'])
axs[0].set_ylim(0, 1)
axs[0].set_ylabel('Accuracy')
axs[0].set_title('Training Model Accuracy Comparison')
# Testing Accuracy for Combined Dataset
axs[1].bar(model_names, test_accuracy_scores, color=['blue', 'green'])
axs[1].set_ylim(0, 1)
axs[1].set_ylabel('Accuracy')
axs[1].set_title('Testing Model Accuracy Comparison')
# Adjust layout and show plot
plt.tight_layout()
plt.show()
In [12]:
# Checking the number of images with each difficulty
files = os.listdir(imagedir)
image_files_starting_with_easy = [file for file in files if file.lower().startswith('easy') and file.lower().endswith(('jpg'))]
num_easy= len(image_files_starting_with_easy)
print(f'Total number of image files starting with "easy": {num_easy}')
image_files_starting_with_mid = [file for file in files if file.lower().startswith('mid') and file.lower().endswith(('jpg'))]
num_mid = len(image_files_starting_with_mid)
print(f'Total number of image files starting with "mid": {num_mid}')
image_files_starting_with_hard = [file for file in files if file.lower().startswith('hard') and file.lower().endswith(('jpg'))]
num_hard = len(image_files_starting_with_hard)
print(f'Total number of image files starting with "hard": {num_hard}')
Total number of image files starting with "easy": 240 Total number of image files starting with "mid": 480 Total number of image files starting with "hard": 240
In [13]:
# Dataset 2: Easy dataset
x_easy = []
y_easy = []
for subdir, _, files in os.walk(imagedir):
for filename in files:
if filename.endswith(".jpg") & (filename.startswith("easy") | filename.startswith("real")):
image = imread(os.path.join(subdir, filename))
gray_image = rgb2gray(image)
gray_image_resized = resize(gray_image, (100, 100))
flattened_image = gray_image_resized.flatten()
x_easy.append(flattened_image)
# Assigning label 0 for fake images and label 1 for real images based on file naming convention
if any(label in filename.lower() for label in ['easy']):
y_easy.append(0) # Label as fake
else:
y_easy.append(1) # Label as real
x_easy = np.array(x_easy)
y_easy = np.array(y_easy)
# splitting the combined data into training and testing sets
x_easy_train, x_easy_test, y_easy_train, y_easy_test = train_test_split(
x_easy, y_easy, test_size=0.2, shuffle=True, random_state=42, stratify=y_easy)
In [14]:
# Setting PCA
pca = PCA(n_components=200, svd_solver='randomized')
pca.fit(np.vstack((x_easy_train, x_easy_test)))
x_easy_train_pca = pca.transform(x_easy_train)
x_easy_test_pca = pca.transform(x_easy_test)
In [15]:
# SVC easy
train_accuracies_svc_easy = []
test_accuracies_svc_easy = []
for i in range(3):
m_svc_easy = SVC()
parameters_svc_easy = [{'gamma': [0.01, 0.001, 0.0001], 'C': [1, 10, 100, 1000]}]
grid_search_svc_easy = GridSearchCV(m_svc_easy, parameters_svc_easy, cv=2)
grid_search_svc_easy.fit(x_easy_train_pca, y_easy_train)
best_estimator_svc_easy = grid_search_svc_easy.best_estimator_
train_pred_svc_easy = best_estimator_svc_easy.predict(x_easy_train_pca)
test_pred_svc_easy = best_estimator_svc_easy.predict(x_easy_test_pca)
train_accuracy_svc_easy = accuracy_score(y_easy_train, train_pred_svc_easy)
test_accuracy_svc_easy = accuracy_score(y_easy_test, test_pred_svc_easy)
train_accuracies_svc_easy.append(train_accuracy_svc_easy)
test_accuracies_svc_easy.append(test_accuracy_svc_easy)
print('Training Accuracy:', mean(train_accuracies_svc_easy))
print('Testing Accuracy:', mean(test_accuracies_svc_easy))
Training Accuracy: 1.0 Testing Accuracy: 0.8188679245283019
In [16]:
# Multi-Layer Perceptron easy
train_accuracies_mlp_easy = []
test_accuracies_mlp_easy = []
for i in range(3):
mlp_easy = MLPClassifier()
mlp_parameters_easy = [{'activation': ['logistic', 'relu'],
'learning_rate': ['constant', 'adaptive'],
'learning_rate_init': [0.001, 0.01, 0.1],
'max_iter': [1000, 1200, 1400, 1600]}]
mlp_grid_search_easy = GridSearchCV(mlp_easy, mlp_parameters_easy, cv=2)
mlp_grid_search_easy.fit(x_easy_train_pca, y_easy_train)
best_estimator_mlp_easy = mlp_grid_search_easy.best_estimator_
train_pred_mlp_easy = best_estimator_mlp_easy.predict(x_easy_train_pca)
test_pred_mlp_easy = best_estimator_mlp_easy.predict(x_easy_test_pca)
train_accuracy_mlp_easy = accuracy_score(y_easy_train, train_pred_mlp_easy)
test_accuracy_mlp_easy = accuracy_score(y_easy_test, test_pred_mlp_easy)
train_accuracies_mlp_easy.append(train_accuracy_mlp_easy)
test_accuracies_mlp_easy.append(test_accuracy_mlp_easy)
print('Training Accuracy:', mean(train_accuracies_mlp_easy))
print('Testing Accuracy:', mean(test_accuracies_mlp_easy))
Training Accuracy: 0.8112373737373737 Testing Accuracy: 0.791194968553459
In [17]:
# Data for visualization
model_names = ['SVC with PCA', 'MLPClassifier with PCA']
train_accuracy_scores = [mean(train_accuracies_svc_easy), mean(train_accuracies_mlp_easy)]
test_accuracy_scores = [mean(test_accuracies_svc_easy), mean(test_accuracies_mlp_easy)]
# Creating bar plots for accuracy
fig, axs = plt.subplots(2, figsize=(5, 8))
# Training Accuracy for Easy Dataset
axs[0].bar(model_names, train_accuracy_scores, color=['blue', 'green'])
axs[0].set_ylim(0, 1)
axs[0].set_ylabel('Accuracy')
axs[0].set_title('Training Model Accuracy Comparison')
# Testing Accuracy for Easy Dataset
axs[1].bar(model_names, test_accuracy_scores, color=['blue', 'green'])
axs[1].set_ylim(0, 1)
axs[1].set_ylabel('Accuracy')
axs[1].set_title('Testing Model Accuracy Comparison')
# Adjust layout and show plot
plt.tight_layout()
plt.show()
In [18]:
# Dataset 3: Medium dataset
x_mid = []
y_mid = []
for subdir, _, files in os.walk(imagedir):
for filename in files:
if filename.endswith(".jpg") & (filename.startswith("mid") | filename.startswith("real")):
image = imread(os.path.join(subdir, filename))
gray_image = rgb2gray(image)
gray_image_resized = resize(gray_image, (100, 100))
flattened_image = gray_image_resized.flatten()
x_mid.append(flattened_image)
# Assigning label 0 for fake images and label 1 for real images based on file naming convention
if any(label in filename.lower() for label in ['mid']):
y_mid.append(0) # Label as fake
else:
y_mid.append(1) # Label as real
x_mid = np.array(x_mid)
y_mid = np.array(y_mid)
# splitting the combined data into training and testing sets
x_mid_train, x_mid_test, y_mid_train, y_mid_test = train_test_split(
x_mid, y_mid, test_size=0.2, shuffle=True, random_state=42, stratify=y_mid)
In [19]:
# Setting PCA
pca = PCA(n_components=200, svd_solver='randomized')
pca.fit(np.vstack((x_mid_train, x_mid_test)))
x_mid_train_pca = pca.transform(x_mid_train)
x_mid_test_pca = pca.transform(x_mid_test)
In [20]:
# SVC mid
train_accuracies_svc_mid = []
test_accuracies_svc_mid = []
for i in range(3):
m_svc_mid = SVC()
parameters_svc_mid = [{'gamma': [0.01, 0.001, 0.0001], 'C': [1, 10, 100, 1000]}]
grid_search_svc_mid = GridSearchCV(m_svc_mid, parameters_svc_mid, cv=2)
grid_search_svc_mid.fit(x_mid_train_pca, y_mid_train)
best_estimator_svc_mid = grid_search_svc_mid.best_estimator_
train_pred_svc_mid = best_estimator_svc_mid.predict(x_mid_train_pca)
test_pred_svc_mid = best_estimator_svc_mid.predict(x_mid_test_pca)
train_accuracy_svc_mid = accuracy_score(y_mid_train, train_pred_svc_mid)
test_accuracy_svc_mid = accuracy_score(y_mid_test, test_pred_svc_mid)
train_accuracies_svc_mid.append(train_accuracy_svc_mid)
test_accuracies_svc_mid.append(test_accuracy_svc_mid)
print('Training Accuracy:', mean(train_accuracies_svc_mid))
print('Testing Accuracy:', mean(test_accuracies_svc_mid))
Training Accuracy: 1.0 Testing Accuracy: 0.6932907348242812
In [29]:
# Multi-Layer Perceptron mid
train_accuracies_mlp_mid = []
test_accuracies_mlp_mid = []
for i in range(3):
mlp_mid = MLPClassifier()
mlp_parameters_mid = [{'activation': ['logistic', 'relu'],
'learning_rate': ['constant', 'adaptive'],
'learning_rate_init': [0.001, 0.01, 0.1],
'max_iter': [1000, 1200, 1400, 1600]}]
mlp_grid_search_mid = GridSearchCV(mlp_mid, mlp_parameters_mid, cv=2)
mlp_grid_search_mid.fit(x_mid_train_pca, y_mid_train)
best_estimator_mlp_mid = mlp_grid_search_mid.best_estimator_
train_pred_mlp_mid = best_estimator_mlp_mid.predict(x_mid_train_pca)
test_pred_mlp_mid = best_estimator_mlp_mid.predict(x_mid_test_pca)
train_accuracy_mlp_mid = accuracy_score(y_mid_train, train_pred_mlp_mid)
test_accuracy_mlp_mid = accuracy_score(y_mid_test, test_pred_mlp_mid)
train_accuracies_mlp_mid.append(train_accuracy_mlp_mid)
test_accuracies_mlp_mid.append(test_accuracy_mlp_mid)
print('Training Accuracy:', mean(train_accuracies_mlp_mid))
print('Testing Accuracy:', mean(test_accuracies_mlp_mid))
Training Accuracy: 0.6907051282051282 Testing Accuracy: 0.6506922257720981
In [30]:
# Data for visualization
model_names = ['SVC with PCA', 'MLPClassifier with PCA']
train_accuracy_scores = [mean(train_accuracies_svc_mid), mean(train_accuracies_mlp_mid)]
test_accuracy_scores = [mean(test_accuracies_svc_mid), mean(test_accuracies_mlp_mid)]
# Creating bar plots for accuracy
fig, axs = plt.subplots(2, figsize=(5, 8))
# Training Accuracy plot
axs[0].bar(model_names, train_accuracy_scores, color=['blue', 'green'])
axs[0].set_ylim(0, 1)
axs[0].set_ylabel('Accuracy')
axs[0].set_title('Training Model Accuracy Comparison')
# Testing Accuracy plot
axs[1].bar(model_names, test_accuracy_scores, color=['blue', 'green'])
axs[1].set_ylim(0, 1)
axs[1].set_ylabel('Accuracy')
axs[1].set_title('Testing Model Accuracy Comparison')
# Adjust layout and show plot
plt.tight_layout()
plt.show()
In [23]:
# Dataset 4: Hard dataset
x_hard = []
y_hard = []
for subdir, _, files in os.walk(imagedir):
for filename in files:
if filename.endswith(".jpg") & (filename.startswith("hard") | filename.startswith("real")):
image = imread(os.path.join(subdir, filename))
gray_image = rgb2gray(image)
gray_image_resized = resize(gray_image, (100, 100))
flattened_image = gray_image_resized.flatten()
x_hard.append(flattened_image)
# Assigning label 0 for fake images and label 1 for real images based on file naming convention
if any(label in filename.lower() for label in ['hard']):
y_hard.append(0) # Label as fake
else:
y_hard.append(1) # Label as real
x_hard = np.array(x_hard)
y_hard = np.array(y_hard)
# splitting the combined data into training and testing sets
x_hard_train, x_hard_test, y_hard_train, y_hard_test = train_test_split(
x_hard, y_hard, test_size=0.2, shuffle=True, random_state=42, stratify=y_hard)
In [24]:
# Setting PCA
pca = PCA(n_components=200, svd_solver='randomized')
pca.fit(np.vstack((x_hard_train, x_hard_test)))
x_hard_train_pca = pca.transform(x_hard_train)
x_hard_test_pca = pca.transform(x_hard_test)
In [25]:
# SVC hard
train_accuracies_svc_hard = []
test_accuracies_svc_hard = []
for i in range(3):
m_svc_hard = SVC()
parameters_svc_hard = [{'gamma': [0.01, 0.001, 0.0001], 'C': [1, 10, 100, 1000]}]
grid_search_svc_hard = GridSearchCV(m_svc_hard, parameters_svc_hard, cv=2)
grid_search_svc_hard.fit(x_hard_train_pca, y_hard_train)
best_estimator_svc_hard = grid_search_svc_hard.best_estimator_
train_pred_svc_hard = best_estimator_svc_hard.predict(x_hard_train_pca)
test_pred_svc_hard = best_estimator_svc_hard.predict(x_hard_test_pca)
train_accuracy_svc_hard = accuracy_score(y_hard_train, train_pred_svc_hard)
test_accuracy_svc_hard = accuracy_score(y_hard_test, test_pred_svc_hard)
train_accuracies_svc_hard.append(train_accuracy_svc_hard)
test_accuracies_svc_hard.append(test_accuracy_svc_hard)
print('Training Accuracy:', mean(train_accuracies_svc_hard))
print('Testing Accuracy:', mean(test_accuracies_svc_hard))
Training Accuracy: 1.0 Testing Accuracy: 0.8188679245283019
In [26]:
# Multi-Layer Perceptron hard
train_accuracies_mlp_hard = []
test_accuracies_mlp_hard = []
for i in range(3):
mlp_hard = MLPClassifier()
mlp_parameters_hard = [{'activation': ['logistic', 'relu'],
'learning_rate': ['constant', 'adaptive'],
'learning_rate_init': [0.001, 0.01, 0.1],
'max_iter': [1000, 1200, 1400, 1600]}]
mlp_grid_search_hard = GridSearchCV(mlp_hard, mlp_parameters_hard, cv=2)
mlp_grid_search_hard.fit(x_hard_train_pca, y_hard_train)
best_estimator_mlp_hard = mlp_grid_search_hard.best_estimator_
train_pred_mlp_hard = best_estimator_mlp_hard.predict(x_hard_train_pca)
test_pred_mlp_hard = best_estimator_mlp_hard.predict(x_hard_test_pca)
train_accuracy_mlp_hard = accuracy_score(y_hard_train, train_pred_mlp_hard)
test_accuracy_mlp_hard = accuracy_score(y_hard_test, test_pred_mlp_hard)
train_accuracies_mlp_hard.append(train_accuracy_mlp_hard)
test_accuracies_mlp_hard.append(test_accuracy_mlp_hard)
print('Training Accuracy:', mean(train_accuracies_mlp_hard))
print('Testing Accuracy:', mean(test_accuracies_mlp_hard))
Training Accuracy: 0.8074494949494949 Testing Accuracy: 0.7899371069182392
In [27]:
# Data for visualization
model_names = ['SVC with PCA', 'MLPClassifier with PCA']
train_accuracy_scores = [mean(train_accuracies_svc_hard), mean(train_accuracies_mlp_hard)]
test_accuracy_scores = [mean(test_accuracies_svc_hard), mean(test_accuracies_mlp_hard)]
# Creating bar plots for accuracy
fig, axs = plt.subplots(2, figsize=(5, 8))
# Training Accuracy plot
axs[0].bar(model_names, train_accuracy_scores, color=['blue', 'green'])
axs[0].set_ylim(0, 1)
axs[0].set_ylabel('Accuracy')
axs[0].set_title('Training Model Accuracy Comparison')
# Testing Accuracy plot
axs[1].bar(model_names, test_accuracy_scores, color=['blue', 'green'])
axs[1].set_ylim(0, 1)
axs[1].set_ylabel('Accuracy')
axs[1].set_title('Testing Model Accuracy Comparison')
# Adjust layout and show plot
plt.tight_layout()
plt.show()