The COVID-19 pandemic has affected us all, for better or worse, yet Americans hold vastly different opinions about it; COVID-19 has become a political issue. So, I decided to dig deeper into that relationship, using techniques from data science to build a machine learning algorithm that makes predictions based upon this information. A high accuracy will signal a likely relationship between a county's demographics and COVID-19 statistics and its political affiliation.
In this notebook, I will create a k-NN (k-nearest neighbors) classifier whose features are COVID-19 statistics for a given county along with demographic information about that county. The classifier will return a prediction for who won the 2020 Presidential election in that county. The techniques I will use in this project include, but are not limited to: Markdown, $\LaTeX$, importing libraries and .csv files, table manipulation, data cleaning and filtering, defining functions, statistical distribution analysis, for loops, and basic machine learning.
Note: I will use first-person-plural pronouns, such as "we" and "our". In doing so, I refer only to myself and the reader.
Definitions:
Classifiers: A classifier in machine learning is an algorithm that automatically orders or categorizes data into one or more of a set of “classes.” One of the most common examples is an email classifier that scans emails to filter them by class label: Spam or Not Spam.
Per Capita: Per capita means per person. It is a Latin term that translates to "by the head." It is commonly used in statistics, economics, and business to report an average per person. For example, if a county of 50,000 residents records 5,000 cases, it has 5,000/50,000 = 0.1 cases per capita, or 10 cases per 100 residents.
Note: For further clarification of any terms in this project or for information on any basic topic in data science, please refer to Computational and Inferential Thinking: The Foundations of Data Science, by Ani Adhikari and John DeNero, with contributions from David Wagner and Henry Milner.
Here, we import all of the libraries necessary to complete this project.
from datascience import *
import numpy as np
import math
import scipy.stats
import pandas as pd
from IPython.display import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
Next, we will load three .csv files with the information needed to build the classifier (dropping unnecessary data), acquired from the New York Times; Politico and the New York Times; and the Census Bureau, respectively. We will then import a .csv file from the EPA to validate the population data from our other tables. This validation will be done by randomly selecting FIPS numbers and checking whether the data from our two sources on population look plausible; we expect to see a small difference, because the data are from two different years (2015 and 2017).
covid_county = Table().read_table('us-counties.csv').drop(0,6,7,8,9)
covid_county.show(5)
county | state | fips | cases | deaths |
---|---|---|---|---|
Autauga | Alabama | 1001 | 7150 | 111 |
Baldwin | Alabama | 1003 | 21661 | 311 |
Barbour | Alabama | 1005 | 2337 | 59 |
Bibb | Alabama | 1007 | 2665 | 64 |
Blount | Alabama | 1009 | 6887 | 139 |
... (3242 rows omitted)
pop_county = Table().read_table('county-population.csv').select(7,5)
pop_county.show(5)
fips | 2015 POPULATION |
---|---|
1001 | 55,347 |
1003 | 203,709 |
1005 | 26,489 |
1007 | 22,583 |
1009 | 57,673 |
... (3229 rows omitted)
elect_county = Table().read_table('county-elections.csv').select(1,7,8).relabel(0,'fips')
elect_county.show(5)
fips | per_gop | per_dem |
---|---|---|
1001 | 0.714368 | 0.270184 |
1003 | 0.761714 | 0.22409 |
1005 | 0.534512 | 0.457882 |
1007 | 0.784263 | 0.206983 |
1009 | 0.895716 | 0.0956938 |
... (3147 rows omitted)
complete = Table().read_table('county_complete.csv').drop(3,4,5,6,7,8,9,10).select(0, 3, 'median_age_2019', 'white_2019', 'hs_grad_2019', 'bachelors_2019', 'median_household_income_2019', 'poverty_2019', 'unemployment_rate_2019')
complete.show(5)
fips | pop2017 | median_age_2019 | white_2019 | hs_grad_2019 | bachelors_2019 | median_household_income_2019 | poverty_2019 | unemployment_rate_2019 |
---|---|---|---|---|---|---|---|---|
1001 | 55504 | 38.2 | 76.8 | 88.5 | 26.6 | 58731 | 15.2 | 3.5 |
1003 | 212628 | 43 | 86.2 | 90.8 | 31.9 | 58320 | 10.4 | 4 |
1005 | 25270 | 40.4 | 46.8 | 73.2 | 11.6 | 32525 | 30.7 | 9.4 |
1007 | 22668 | 40.9 | 76.8 | 79.1 | 10.4 | 47542 | nan | 7 |
1009 | 58013 | 40.7 | 95.5 | 80.5 | 13.1 | 49358 | 13.6 | 3.1 |
... (3137 rows omitted)
We can see below that the two sets of population figures are very similar, so it is fair to say that the data are likely accurate.
pop_county.join('fips', complete.select(0,1)).sample(10).show(10)
fips | 2015 POPULATION | pop2017 |
---|---|---|
40047 | 63,569 | 61581 |
5137 | 12,456 | 12537 |
16047 | 15,284 | 15124 |
46045 | 3,999 | 3919 |
51041 | 335,687 | 343599 |
40001 | 22,004 | 21909 |
1055 | 103,057 | 102755 |
48121 | 780,612 | 836210 |
15009 | 164,637 | 166260 |
47085 | 18,135 | 18484 |
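To put a number on "very similar," here is a quick check (a small addition to the original analysis; it assumes the 2015 figures were read in as comma-separated strings, which is why they are cleaned before comparing):
joined = pop_county.join('fips', complete.select(0, 1))
pop_2015 = joined.apply(lambda s: int(str(s).replace(',', '')), '2015 POPULATION')
# Median relative difference; a few percent is consistent with two years of change
np.median(abs(joined.column('pop2017') - pop_2015) / pop_2015)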
Now, we'll clean up the data from these tables and join them into one table that can be used to build the classifier.
# Keep rows with non-negative case/death counts, and exclude territories
# (FIPS codes of 60000 and above are not states or the District of Columbia)
county = covid_county.where('cases', are.above_or_equal_to(0)).where('deaths', are.above_or_equal_to(0)).where('fips', are.below(60000))
counties = county.join('fips', elect_county).join('fips', complete)
counties.show(5)
fips | county | state | cases | deaths | per_gop | per_dem | pop2017 | median_age_2019 | white_2019 | hs_grad_2019 | bachelors_2019 | median_household_income_2019 | poverty_2019 | unemployment_rate_2019 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1001 | Autauga | Alabama | 7150 | 111 | 0.714368 | 0.270184 | 55504 | 38.2 | 76.8 | 88.5 | 26.6 | 58731 | 15.2 | 3.5 |
1003 | Baldwin | Alabama | 21661 | 311 | 0.761714 | 0.22409 | 212628 | 43 | 86.2 | 90.8 | 31.9 | 58320 | 10.4 | 4 |
1005 | Barbour | Alabama | 2337 | 59 | 0.534512 | 0.457882 | 25270 | 40.4 | 46.8 | 73.2 | 11.6 | 32525 | 30.7 | 9.4 |
1007 | Bibb | Alabama | 2665 | 64 | 0.784263 | 0.206983 | 22668 | 40.9 | 76.8 | 79.1 | 10.4 | 47542 | nan | 7 |
1009 | Blount | Alabama | 6887 | 139 | 0.895716 | 0.0956938 | 58013 | 40.7 | 95.5 | 80.5 | 13.1 | 49358 | 13.6 | 3.1 |
... (3102 rows omitted)
Next, we will relabel the columns of the counties table.
counties = counties.relabel('pop2017', 'pop').relabel('median_age_2019', 'median age').relabel('white_2019', 'white').relabel('hs_grad_2019', 'hs grad').relabel('bachelors_2019', 'bachelors')
counties = counties.relabel('median_household_income_2019', 'median household income').relabel('poverty_2019', 'poverty').relabel('unemployment_rate_2019', 'unemployment')
counties.show(5)
fips | county | state | cases | deaths | per_gop | per_dem | pop | median age | white | hs grad | bachelors | median household income | poverty | unemployment |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1001 | Autauga | Alabama | 7150 | 111 | 0.714368 | 0.270184 | 55504 | 38.2 | 76.8 | 88.5 | 26.6 | 58731 | 15.2 | 3.5 |
1003 | Baldwin | Alabama | 21661 | 311 | 0.761714 | 0.22409 | 212628 | 43 | 86.2 | 90.8 | 31.9 | 58320 | 10.4 | 4 |
1005 | Barbour | Alabama | 2337 | 59 | 0.534512 | 0.457882 | 25270 | 40.4 | 46.8 | 73.2 | 11.6 | 32525 | 30.7 | 9.4 |
1007 | Bibb | Alabama | 2665 | 64 | 0.784263 | 0.206983 | 22668 | 40.9 | 76.8 | 79.1 | 10.4 | 47542 | nan | 7 |
1009 | Blount | Alabama | 6887 | 139 | 0.895716 | 0.0956938 | 58013 | 40.7 | 95.5 | 80.5 | 13.1 | 49358 | 13.6 | 3.1 |
... (3102 rows omitted)
In this cell, we will convert the per_gop and per_dem proportions into percentage columns named trump and biden. The fix_votes function does this conversion.
def fix_votes(x):
    """Return a proportion as a rounded percentage"""
    return round(x * 100, 3)
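# A quick check of fix_votes (a small addition): Autauga County's per_gop
# value of 0.714368 should become 71.437
fix_votes(0.714368)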
counties['trump'] = counties.apply(fix_votes, 'per_gop')
counties['biden'] = counties.apply(fix_votes, 'per_dem')
# Drop the raw proportions (and poverty, which we will not use), then reorder
data_1 = counties.drop('poverty', 'per_gop', 'per_dem')
data = data_1.select(0,1,2,5,3,4,12,13,10,11,6,7,8,9).relabel('unemployment', 'unemployed')
# Convert raw case/death counts to per-capita rates, then to rounded percentages
data = data.with_columns('cases1', data.column('cases')/data.column('pop'), 'deaths1', data.column('deaths')/data.column('pop')).drop('cases', 'deaths')
data = data.relabel('cases1', 'cases').relabel('deaths1', 'deaths')
c = data.apply(fix_votes, 'cases')
d = data.apply(fix_votes, 'deaths')
data = data.drop(12, 13).with_columns('cases', c, 'deaths', d)
data = data.select(0, 1, 2, 3, 'cases', 'deaths', 4, 5, 6, 7, 8, 9, 10, 11)
data.show(5)
fips | county | state | pop | cases | deaths | trump | biden | median household income | unemployed | median age | white | hs grad | bachelors |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1001 | Autauga | Alabama | 55504 | 12.882 | 0.2 | 71.437 | 27.018 | 58731 | 3.5 | 38.2 | 76.8 | 88.5 | 26.6 |
1003 | Baldwin | Alabama | 212628 | 10.187 | 0.146 | 76.171 | 22.409 | 58320 | 4 | 43 | 86.2 | 90.8 | 31.9 |
1005 | Barbour | Alabama | 25270 | 9.248 | 0.233 | 53.451 | 45.788 | 32525 | 9.4 | 40.4 | 46.8 | 73.2 | 11.6 |
1007 | Bibb | Alabama | 22668 | 11.757 | 0.282 | 78.426 | 20.698 | 47542 | 7 | 40.9 | 76.8 | 79.1 | 10.4 |
1009 | Blount | Alabama | 58013 | 11.871 | 0.24 | 89.572 | 9.569 | 49358 | 3.1 | 40.7 | 95.5 | 80.5 | 13.1 |
... (3102 rows omitted)
In the data table below, the following columns are represented:

Note: All Alaskan counties have been excluded, as Alaskan election results are not reported by county.

- fips: An identification number given to each county by the federal government
- county: The county name
- state: The state the county is in, or the District of Columbia
- pop: The population of the county (as of 2017)
- cases: Cumulative COVID-19 cases in that county per capita, expressed as a percent of its population (as of June 2nd, 2021)
- deaths: Cumulative COVID-19 deaths in that county per capita, expressed as a percent of its population (as of June 2nd, 2021)
- trump: The percent of votes cast for Donald J. Trump in that county in the 2020 Presidential Election
- biden: The percent of votes cast for Joseph R. Biden in that county in the 2020 Presidential Election
- median household income: The median household income of that county (as of 2019)
- unemployed: The unemployment rate in that county (as of 2019)
- median age: The median age of that county (as of 2019)
- white: The percent of residents of that county who are white (as of 2019)
- hs grad: The percent of residents of that county who have graduated high school (as of 2019)
- bachelors: The percent of residents of that county who have graduated college with a bachelor's degree (as of 2019)
data.show(5)
fips | county | state | pop | cases | deaths | trump | biden | median household income | unemployed | median age | white | hs grad | bachelors |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1001 | Autauga | Alabama | 55504 | 12.882 | 0.2 | 71.437 | 27.018 | 58731 | 3.5 | 38.2 | 76.8 | 88.5 | 26.6 |
1003 | Baldwin | Alabama | 212628 | 10.187 | 0.146 | 76.171 | 22.409 | 58320 | 4 | 43 | 86.2 | 90.8 | 31.9 |
1005 | Barbour | Alabama | 25270 | 9.248 | 0.233 | 53.451 | 45.788 | 32525 | 9.4 | 40.4 | 46.8 | 73.2 | 11.6 |
1007 | Bibb | Alabama | 22668 | 11.757 | 0.282 | 78.426 | 20.698 | 47542 | 7 | 40.9 | 76.8 | 79.1 | 10.4 |
1009 | Blount | Alabama | 58013 | 11.871 | 0.24 | 89.572 | 9.569 | 49358 | 3.1 | 40.7 | 95.5 | 80.5 | 13.1 |
... (3102 rows omitted)
data.to_csv('politics_data.csv')
We will now modify the data table to make our classification easier. The first step is to identify which candidate won in each county. To do this, we will define the vote function below and create a votes table with only the voting percentages. Then, we will save the resulting winner array.
def vote(x, y):
    """Return the plurality winner of the Presidential Election"""
    if x > y:
        return 'Trump'
    elif y > x:
        return 'Biden'
    else:
        return 'Tie'
votes = data.select('trump', 'biden')
winner = votes.apply(vote,0,1)
fips = data.column('fips')
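Before moving on, it is worth a quick look at the class balance of the winner array, since k-NN predictions lean toward the majority class (a small check, not part of the original pipeline):
Table().with_column('winner', winner).group('winner')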
We now create two tables, one with the categorical data and one with the numerical data. We will also drop the vote percentage of each candidate; since we have the winner array, they are no longer relevant.
cat = data.select(0,1,2).with_column('winner', winner)
num = data.select(3,4,5,8,9,10,11,12,13)
Now, because the units in each column are different, we want to convert the data into standard units (z-scores) so that all features carry the same weight in our classifier. We will do this by defining the s_u and standardize functions.
def s_u(array):
    """Return array in standard units"""
    return (array - np.mean(array)) / np.std(array)

def standardize(table):
    """Return a table in standard units"""
    t = Table()
    for label in table.labels:
        t = t.with_column(label, s_u(table.column(label)))
    return t
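As a quick sanity check of s_u (a small addition): an array in standard units should have mean 0 and standard deviation 1.
s_u(make_array(1, 2, 3))  # array([-1.22474487,  0.        ,  1.22474487])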
Now, each value represents how many standard deviations it is above or below the mean of its column. We can now call the standardize function on the num table and add the fips column so we can join all the data back into one table.
stan_num = standardize(num)
stan_num_fips = stan_num.with_column('fips', fips)
full = cat.join('fips', stan_num_fips)
full.show(5)
fips | county | state | winner | pop | cases | deaths | median household income | unemployed | median age | white | hs grad | bachelors |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1001 | Autauga | Alabama | Trump | -0.141733 | 0.909967 | -0.0287858 | 0.383516 | -0.550973 | -0.608552 | -0.399307 | 0.251456 | 0.487353 |
1003 | Baldwin | Alabama | Trump | 0.338787 | -0.0038243 | -0.510629 | 0.354379 | -0.354464 | 0.283253 | 0.176396 | 0.618838 | 1.04212 |
1005 | Barbour | Alabama | Trump | -0.234195 | -0.32221 | 0.265674 | -1.47431 | 1.76784 | -0.199808 | -2.23666 | -2.19243 | -1.08275 |
1007 | Bibb | Alabama | Trump | -0.242152 | 0.528514 | 0.702903 | -0.409707 | 0.824593 | -0.106912 | -0.399307 | -1.25002 | -1.20836 |
1009 | Blount | Alabama | Trump | -0.134059 | 0.567168 | 0.328135 | -0.280965 | -0.708181 | -0.14407 | 0.745974 | -1.02639 | -0.925743 |
... (3101 rows omitted)
As is standard practice in machine learning, we will now randomly split the data into two sets: training, which will be used to train our algorithm, and test, which will be used to evaluate the accuracy of our classifier.
# Shuffle the rows, then split them roughly in half: 1,606 training rows
# and 1,500 test rows
shuf_full = full.sample(with_replacement = False)
training = shuf_full.take(np.arange(1606))
test = shuf_full.take(np.arange(1606, 3106))
We'll now work only with the training table until we have built our classifier. But first, let's define the functions we will need:

- distance: finds the Euclidean distance between two points
- row_distance: uses the previous function to find the distance between two row objects
- distances: creates a copy of training with one more column, which contains the distance from each row to the given example
- closest: creates a new table with only the $k$ closest rows
- majority_class: finds the class of the majority of the rows in a given table
- classify_1: calls majority_class on the table from closest to return our classifier's prediction
def distance(pt1, pt2):
    """Return the distance between two points, represented as arrays"""
    return np.sqrt(sum((pt1 - pt2)**2))

def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table"""
    return distance(np.array(row1), np.array(row2))

def distances(training, example):
    """Return training table with distances column"""
    distances = make_array()
    attributes_only = training.drop('winner', 0, 1, 2)
    for row in attributes_only.rows:
        distances = np.append(distances, row_distance(row, example))
    return training.with_column('Distance', distances)

def closest(training, example, k):
    """Return a table of the k closest neighbors to example"""
    return distances(training, example).sort('Distance').take(np.arange(k))

def majority_class(topk):
    """Return the class with the highest count"""
    return topk.group('winner').sort('count', descending=True).column(0).item(0)

def classify_1(training, example, k):
    """Return the majority class among the k nearest neighbors of example"""
    return majority_class(closest(training, example, k))
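As a quick smoke test before measuring accuracy in bulk (a small addition; any row index would do), we can classify a single test-set row and compare the prediction with its recorded winner:
example_row = test.drop('winner', 0, 1, 2).row(0)
classify_1(training, example_row, 5), test.column('winner').item(0)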
Here, we define one more function, accuracy, which evaluates the performance of our classifier on the test set by comparing its predictions to the actual winners and returning the percentage of examples that classify_1 got correct.
def accuracy(training, test, k):
    """Return the percentage of correctly classified examples in the test set"""
    test_attributes = test.drop('winner', 0, 1, 2)
    num_correct = 0
    for i in np.arange(test.num_rows):
        c = classify_1(training, test_attributes.row(i), k)
        num_correct = num_correct + (c == test.column('winner').item(i))
    return (num_correct / test.num_rows) * 100
With some trial and error, we can find that the optimal number of neighbors to base our prediction on is $k=13$; this $k$ leads to an accuracy on the test set of $92.6\%$.
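That trial and error can be scripted. A minimal sketch of the search is below (odd values of k avoid ties; the loop is slow, since each call classifies every test row, and the exact accuracies vary with the random train/test split):
for k in np.arange(1, 26, 2):
    print(k, accuracy(training, test, k))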
accuracy(training, test, 13)
92.6
Thus, we will define the final k-NN function classify, which makes its prediction using a fixed $k=13$ and the full table.
def classify(example):
    """Return the k-NN prediction for example using k = 13 and the full table"""
    return classify_1(full, example, 13)
Finally, we can create a new function predict which, given a state and county name, will make a prediction using $k=13$ and return the actual result for comparison. This will also use the full table.
def predict(state, county):
    """Print a prediction for the given county and return the actual result"""
    ex = full.where('state', state).where('county', county)
    example = ex.drop(0, 1, 2, 3).row(0)
    result = classify(example)
    print('My algorithm predicts that', result, 'won the 2020 Presidential election in', county + ',', state)
    return ex.select(0, 2, 1, 3)
Below, I show a few examples of the predict function in action.
predict('California','Yuba')
My algorithm predicts that Trump won the 2020 Presidential election in Yuba, California
fips | state | county | winner |
---|---|---|---|
6115 | California | Yuba | Trump |
predict('Texas','Frio')
My algorithm predicts that Trump won the 2020 Presidential election in Frio, Texas
fips | state | county | winner |
---|---|---|---|
48163 | Texas | Frio | Trump |
predict('New York','Nassau')
My algorithm predicts that Biden won the 2020 Presidential election in Nassau, New York
fips | state | county | winner |
---|---|---|---|
36059 | New York | Nassau | Biden |
predict('Nebraska','Brown')
My algorithm predicts that Trump won the 2020 Presidential election in Brown, Nebraska
fips | state | county | winner |
---|---|---|---|
31017 | Nebraska | Brown | Trump |
predict('Illinois','Lee')
My algorithm predicts that Trump won the 2020 Presidential election in Lee, Illinois
fips | state | county | winner |
---|---|---|---|
17103 | Illinois | Lee | Trump |
predict('Mississippi','Bolivar')
My algorithm predicts that Biden won the 2020 Presidential election in Bolivar, Mississippi
fips | state | county | winner |
---|---|---|---|
28011 | Mississippi | Bolivar | Biden |
As the $92.6\%$ accuracy of the classify function shows, given basic demographic information about a county and its cumulative per-capita COVID-19 case and death counts, we can reliably predict how it voted. We can conclude that the makeup of a county's residents and its experience of the COVID-19 pandemic likely correlate with its political affiliation.
It is possible that the algorithm's ability to predict each county's winner was due only to the demographics, or only to the COVID-19 data. A follow-up project that built a separate classifier from each type of data could compare their accuracies to identify which set of features drives the prediction. If both have relatively low accuracy, then the association depends on both sets together; if only one is high, it can be inferred that the classifier in this project succeeded largely because of that set of variables.
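As a rough sketch of that follow-up (hypothetical and not run here; it reuses the accuracy function from above with a fresh random split for each feature subset):
demographics = full.select('fips', 'county', 'state', 'winner', 'pop', 'median household income', 'unemployed', 'median age', 'white', 'hs grad', 'bachelors')
covid_only = full.select('fips', 'county', 'state', 'winner', 'cases', 'deaths')

def subset_accuracy(tbl, k):
    """Shuffle tbl, split it in half, and return the k-NN accuracy on the held-out half"""
    shuffled = tbl.sample(with_replacement=False)
    half = shuffled.num_rows // 2
    return accuracy(shuffled.take(np.arange(half)), shuffled.take(np.arange(half, shuffled.num_rows)), k)

subset_accuracy(demographics, 13), subset_accuracy(covid_only, 13)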
Another application of this algorithm is evaluating the accuracy of this classifier on other elections: past Presidential races, or even future contests such as the 2022 Midterm Elections or the 2024 Presidential Election.
While this project is limited in scope to these two sets of variables and the 2020 Presidential Election, there are endless variations that could give even deeper insight into the nature of American politics in the context of the two-party system.
If you have any questions about this project, need clarification on any concepts or topics, or would like a walkthrough of my code, do not hesitate to reach out to me at jonathanferrari@berkeley.com. Cheers!