This lesson will be dense, as we will cover the most basic libraries and functions that are the building blocks of the work we do as social scientists. Let's begin by importing the libraries we will use throughout the document.
import numpy as np
import pandas as pd
import random
Like in R, Python is able to handle several types of loops and conditional statements, which allow us to automate tasks and create more efficient code.
With loops, it is possible to iterate over lists or a range of numbers alike to automate a task. We can see a few of these examples below.
# For loop using a list we create within the loop
for x in ['Apples','Bananas','Oranges','Pears','Pineapples','Mangoes','Grapefruits','Cantaloupes']:
    print('I like to eat ' + x + '.')
I like to eat Apples.
I like to eat Bananas.
I like to eat Oranges.
I like to eat Pears.
I like to eat Pineapples.
I like to eat Mangoes.
I like to eat Grapefruits.
I like to eat Cantaloupes.
# For loop using a list that already exists
# random.sample already returns a list, so wrapping it in list() is unnecessary
list1 = random.sample(range(1, 100), 50)
# If you do not call print here, the loop will still run, but you will not see any output
for i in list1:
    print(i*2)
176 4 160 36 114 78 32 168 60 110 186 38 34 46 166 152 150 12 188 118 170 18 154 72 136 120 148 198 100 22 54 194 104 50 42 52 180 156 140 48 44 184 112 96 122 134 172 182 98 86
# For loop using a range to add values to a list
# Creating a list
list2 = []
# For every i from 0 to 9, multiply i by 10 and append the result to the list we created
# This is similar to R code where you assign into an object by indexing it with [i]
for i in range(0,10):
    list2.append(i*10)
print(list2)
[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
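As an aside, the append loop above can also be written as a list comprehension, a compact Python idiom that builds the list in one line (a sketch; the loop version works just as well):

```python
# List comprehension equivalent of the append loop above
list2 = [i * 10 for i in range(0, 10)]
print(list2)  # [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
```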
Like in R, while loops keep running as long as the condition you state holds, and stop once it is no longer met.
# Using a while loop to append a list
# Creating an object
x = 5
# Creating a list
y = []
# While x is smaller than 50: append x*10 to the list, print x, then add 5 to x
while x < 50:
    y.append(x*10)
    print(x)
    x = x + 5
print(y)
5
10
15
20
25
30
35
40
45
[50, 100, 150, 200, 250, 300, 350, 400, 450]
If-else statements are conditional statements that tell Python what to run under different conditions. If the first condition is met, do one task; if not, do another.
# Basic If-Else statement
x = 0
if x != 0:
    print(1/x)
else:
    print('No reciprocal for 0.')
No reciprocal for 0.
# Basic If-Else statement with different x value
x = 2
if x != 0:
    print(1/x)
else:
    print('No reciprocal for 0.')
0.5
'elif' is an additional clause you can add to an if-else statement. It allows you to test more conditions and run more tasks when two branches are not enough.
# Creating an object to represent Democratic presidential vote in a general election
district_vote = 53
if district_vote in range(40,61):
    print('This district is competitive')
elif district_vote in range(0,40):
    print('This is a safe Republican district')
else:
    print('This is a safe Democratic district')
This district is competitive
# Creating an object to represent Democratic presidential vote in a general election
district_vote = 61
if district_vote in range(40,61):
    print('This district is competitive')
elif district_vote in range(0,40):
    print('This is a safe Republican district')
else:
    print('This is a safe Democratic district')
This is a safe Democratic district
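As a side note, the same branching can be written with chained comparisons instead of range(). This is a sketch of an alternative, not a replacement for the code above; it also works for non-integer vote shares, which range() does not handle:

```python
district_vote = 53
# Chained comparison: 40 <= x <= 60 is equivalent to x in range(40, 61) for integers
if 40 <= district_vote <= 60:
    print('This district is competitive')
elif district_vote < 40:
    print('This is a safe Republican district')
else:
    print('This is a safe Democratic district')
```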
It's also possible to nest loops and conditional statements in Python. In the interest of time, here is just one example, but you can follow this link to learn more about how to create them.
# Using the else clause of a while loop
counter = 0
while counter < 10:
    if counter == 10:   # this break never fires: the while condition fails first
        break
    print('Inside loop')
    counter = counter + 1
else:
    # note: this else belongs to the while loop, not the if;
    # it runs only when the loop ends without hitting a break
    print('Inside else')
Inside loop
Inside loop
Inside loop
Inside loop
Inside loop
Inside loop
Inside loop
Inside loop
Inside loop
Inside loop
Inside else
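For contrast, here is a quick sketch of the same loop where the break does fire (using a hypothetical cutoff of 5), so the else clause is skipped:

```python
counter = 0
while counter < 10:
    if counter == 5:
        break              # leaves the loop early
    print('Inside loop')
    counter = counter + 1
else:
    print('Inside else')   # skipped, because the loop ended via break
print(counter)             # 5
```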
So far, we have learned about lists and tuples and how useful they are for working with multiple numeric and non-numeric observations in Python. However, what happens when you want to work with large objects, or even with two-dimensional objects (i.e., objects that contain rows and columns)? Native Python lists and tuples, while good, are not as efficient once you begin working with large amounts of data that might be stacked on top of each other. That is where NumPy comes in. The NumPy library offers an object called an 'array' that can be stacked and used for many types of computation in a much faster and more efficient manner. The main reason this is possible is that arrays are implemented in C, which allows them to be stored in contiguous memory locations within your machine, making them faster to access and manipulate. NumPy also offers several statistical functions that let us run many analyses without additional packages. Learning how NumPy operates is fundamental to working with dataframes (discussed in the next section).
The syntax for creating arrays is np.array(), and it can be used either to convert an existing list or tuple into an array or to create one from scratch. It is important to note that unlike lists and tuples, an array cannot mix data types: all of its elements must share one type, such as strings or integers.
## Converting an existing list to a one dimensional numeric array
# Creating a list
list1 = [1,2,3,4,5,6,7,8,9,10]
# Creating an object that converts a list into an array
array1 = np.array(list1)
# Confirming that it is indeed an array with the type() function; alternatively, look at the variables window
type(array1)
numpy.ndarray
## Creating a one dimensional numeric array from scratch
array1 = np.array([1,2,3,4,5,6,7,8,9,10])
print(array1)
print(type(array1))
[ 1  2  3  4  5  6  7  8  9 10]
<class 'numpy.ndarray'>
## Creating a one dimensional string array
array1 = np.array(["Hi","Hola","Salut","Ciao","Privet","Hallo","Oi","Anyoung","Ahlan","Hej","Hoi"])
print(array1)
print(type(array1))
['Hi' 'Hola' 'Salut' 'Ciao' 'Privet' 'Hallo' 'Oi' 'Anyoung' 'Ahlan' 'Hej' 'Hoi']
<class 'numpy.ndarray'>
## Creating a two-dimensional numeric array
# Brackets tell python to separate the arrays and make them two-dimensional
array1 = np.array([[1,2,3,4,5,6,7,8,9,10],
[11,12,13,14,15,16,17,18,19,20],
[21,22,23,24,25,26,27,28,29,30]])
print(array1)
[[ 1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]
 [21 22 23 24 25 26 27 28 29 30]]
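Once an array exists, a few attributes are handy for checking what you built. This is a quick sketch using the array above:

```python
import numpy as np

array1 = np.array([[1,2,3,4,5,6,7,8,9,10],
                   [11,12,13,14,15,16,17,18,19,20],
                   [21,22,23,24,25,26,27,28,29,30]])
print(array1.shape)  # (3, 10) -> 3 rows, 10 columns
print(array1.ndim)   # 2       -> two-dimensional
print(array1.dtype)  # an integer dtype (exact name varies by platform)
```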
While arrays cannot mix data types, you can coerce Python into allowing different types by passing dtype=object, which tells NumPy to store every observation as a generic Python object. You would not be able to compute calculations with such an array, but it is still cool to know you can do this.
array1 =np.array([[True, False, 'hello'],
['apple', 33.7, (0,1)],
[37,40,50]], dtype=object)
print(array1)
[[True False 'hello']
 ['apple' 33.7 (0, 1)]
 [37 40 50]]
Like lists and tuples, you are able to index specific observations from both one-dimensional and two-dimensional arrays. The indexing mechanism is the same as in R: the first coordinate refers to the rows and the second coordinate refers to the columns. As a reminder, unlike R, Python begins its indexing at 0.
array1 = np.array([[1,2,3,4,5,6,7,8,9,10],
[11,12,13,14,15,16,17,18,19,20],
[21,22,23,24,25,26,27,28,29,30]])
# Indexing the number that is on the first row, third column
print(array1[0,2])
# Indexing the number that is on the third row, sixth column
print(array1[2,5])
# Indexing multiple values - the first three values in the first row | Python excludes the stop value
print(array1[0,0:3])
# Indexing multiple values - the first value from each row | Python excludes the stop value
print(array1[0:3,0])
# Indexing all values from the array
print(array1[:,:])
3
26
[1 2 3]
[ 1 11 21]
[[ 1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]
 [21 22 23 24 25 26 27 28 29 30]]
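Beyond coordinates and slices, NumPy also supports boolean masking, where a condition selects values directly. A short sketch on the same 3x10 numbers:

```python
import numpy as np

array1 = np.arange(1, 31).reshape(3, 10)
# The condition produces a True/False array; indexing with it keeps the True values
print(array1[array1 > 25])  # [26 27 28 29 30]
```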
The NumPy library includes functions that allow us to conduct basic statistics. There are three ways to calculate statistics when working with two-dimensional arrays: over the entire array, down each column (axis=0), or across each row (axis=1).
Basic statistical functions include but are not limited to:

- np.mean() - calculates the mean of an array object
- np.sum() - calculates the sum of an array object
- np.min() - finds the minimum value of an array object
- np.max() - finds the maximum value of an array object
- np.std() - calculates the standard deviation of an array object
- np.median() - finds the median value of an array object
- np.sort() - sorts an array object in ascending order
- np.sort()[::-1] - sorts a one-dimensional array object in descending order
- np.random.random(size=int) - creates an array of random floats between 0 and 1
- np.random.randint(int, int, size=int) - creates an array of random integers between the two bounds
- np.random.randn(int) - returns a sample or samples from the standard normal distribution
- np.random.shuffle() - modifies the sequence of an array by shuffling it in place
- np.count_nonzero() - returns the count of non-zero elements in an array; useful when measuring sparsity

The following lines of code show some examples of these functions and the variations possible with the 'axis' argument.
# Creating an array the way we have been doing so far
array1 = np.array([[1,2,3,4,5,6],
[7,8,9,10,11,12],
[13,14,15,16,17,18]])
# Creating an array using the numpy arange and reshape functions
# You can chain multiple functions with a period '.' so long as it makes sense to do so
# The arguments are: the number to start on, the number to stop before (it is excluded), and the step size
# very similar to the `seq()` function in R
array1 = np.arange(1,19,1).reshape(3,6)
print(array1)
[[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]
 [13 14 15 16 17 18]]
## Finding the mean of the array above
# Entire array
print('The entire mean of the array is:', np.mean(array1))
# Mean of each column in the array - we should get an array of six values
print('The mean of each column in the array is:', np.mean(array1, axis=0))
# Mean of each row in the array - we should get an array of three values
print('The mean of each row in the array is:', np.mean(array1, axis=1))
The entire mean of the array is: 9.5
The mean of each column in the array is: [ 7.  8.  9. 10. 11. 12.]
The mean of each row in the array is: [ 3.5  9.5 15.5]
## Finding the sum of the array
# Entire array
print('The sum of the array is:', np.sum(array1))
# Sum of each column in the array - we should get an array of six values
print('The sum of each column in the array is:', np.sum(array1, axis=0))
# Sum of each row in the array - we should get an array of three values
print('The sum of each row in the array is:', np.sum(array1, axis=1))
The sum of the array is: 171
The sum of each column in the array is: [21 24 27 30 33 36]
The sum of each row in the array is: [21 57 93]
## Finding the median of the array
# Entire array
print('The median of the array is:', np.median(array1))
# Median of each column in the array - we should get an array of six values
print('The median of each column in the array is:', np.median(array1, axis=0))
# Median of each row in the array - we should get an array of three values
print('The median of each row in the array is:', np.median(array1, axis=1))
The median of the array is: 9.5
The median of each column in the array is: [ 7.  8.  9. 10. 11. 12.]
The median of each row in the array is: [ 3.5  9.5 15.5]
print(np.sort(array1))
[[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]
 [13 14 15 16 17 18]]
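Note that the [::-1] trick for descending order is most predictable on one-dimensional arrays; on a two-dimensional array it reverses the order of the rows instead. A quick sketch on a small unsorted array:

```python
import numpy as np

arr = np.array([13, 2, 8, 21, 5])
print(np.sort(arr))        # [ 2  5  8 13 21] -> ascending
print(np.sort(arr)[::-1])  # [21 13  8  5  2] -> descending
```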
# Creating random sample of floats from 0 to 1
array1 = np.random.random(size=1000)
print(array1)
[0.97627021 0.38309446 0.25499555 ... 0.5534811  0.62694435 0.36705052]
(output truncated: 1,000 random floats between 0 and 1)
array1 = np.random.randint(0,50, size=50)
print(array1)
[35 43 4 6 10 44 43 40 9 48 0 17 3 9 18 26 12 29 4 6 8 41 32 23 31 13 49 48 7 13 17 7 33 4 19 47 32 17 48 25 25 11 9 4 33 47 24 46 7 24]
Like in R, you can set a seed to ensure you get the same random draws each time.
np.random.seed(444)
array1 = np.random.randint(0,50, size=50)
print(array1)
[ 3 48 23 8 3 39 12 47 8 41 44 10 26 15 34 18 12 10 16 24 0 6 22 0 10 3 48 6 39 25 35 26 8 7 13 2 15 9 34 9 24 25 2 18 8 44 1 26 26 45]
As mentioned above, knowing how NumPy works is essential to programming in Python, as several libraries are built on top of it. Pandas is one of those libraries. The Pandas library offers several data structures and operations for manipulating numeric data and time series. It allows for the importing, creating, managing, and exporting of dataframes, making it the staple library for data science in Python. Pandas lets you create what are called 'Pandas Series' and 'Pandas DataFrames'. For this lesson, our main focus will be DataFrames and how to create, import, and manipulate them.
A Pandas Series is a simple one-dimensional array that can hold any data type (e.g., integer, string, float, object). A Pandas Series is nothing more than a single column of data in an Excel sheet. Creating a Series is as simple as creating a list, tuple, or one-dimensional array.
PdSeries = pd.Series([1, 2, 3, 4, 5,6,7,8,9,10])
print(PdSeries)
0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64
Like NumPy arrays, a Pandas Series can be indexed using brackets.
# Indexing the third value from the Pandas series
PdSeries[2]
3
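A Series can also carry custom labels instead of the default 0 to n-1 index, which lets you look values up by name. A sketch with hypothetical student names and grades:

```python
import pandas as pd

# The index argument replaces the default positional labels
grades = pd.Series([90, 85, 77], index=['Student A', 'Student B', 'Student C'])
print(grades['Student B'])  # 85
```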
While one Pandas series may not be any more useful than NumPy arrays, several series can be combined into a Pandas DataFrame. A Pandas DataFrame is a two-dimensional tabular data structure with labeled rows and columns, which is the same as a DataFrame used in R, Excel, Stata, SQL, or SPSS. Creating a Pandas DataFrame is similar to creating a Python Dictionary or a DataFrame in R.
# Creating a DataFrame
df = pd.DataFrame({'Name':["Student A", "Student B", "Student C"],
'Year': ["Third Year", "Second Year", "Second Year"],
'Position':["Treasurer","Senator","President"]})
# Using the print function gives you an in-text DataFrame
print(df)
        Name         Year   Position
0  Student A   Third Year  Treasurer
1  Student B  Second Year    Senator
2  Student C  Second Year  President
Calling the DataFrame without using the print function gives you an interactive table thanks to DataSpell. This feature is unique to the program: it lets us view the DataFrame in a new window (like in R) and even export the DataFrame to a csv file without writing additional code.
df
Name | Year | Position | |
---|---|---|---|
0 | Student A | Third Year | Treasurer |
1 | Student B | Second Year | Senator |
2 | Student C | Second Year | President |
You can also create a DataFrame from an existing two-dimensional array.
# Creating an array that has 10 rows and 5 columns
array1 = np.arange(1,100,2).reshape(10,5)
# Creating DataFrame and using 'columns' argument to assign names to the columns in DF
df = pd.DataFrame(array1, columns=['var1','var2','var3','var4','var5'])
df
var1 | var2 | var3 | var4 | var5 | |
---|---|---|---|---|---|
0 | 1 | 3 | 5 | 7 | 9 |
1 | 11 | 13 | 15 | 17 | 19 |
2 | 21 | 23 | 25 | 27 | 29 |
3 | 31 | 33 | 35 | 37 | 39 |
4 | 41 | 43 | 45 | 47 | 49 |
5 | 51 | 53 | 55 | 57 | 59 |
6 | 61 | 63 | 65 | 67 | 69 |
7 | 71 | 73 | 75 | 77 | 79 |
8 | 81 | 83 | 85 | 87 | 89 |
9 | 91 | 93 | 95 | 97 | 99 |
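Once a DataFrame exists, a few standard methods help you get a quick overview. This sketch rebuilds the same array-based DataFrame and inspects it:

```python
import numpy as np
import pandas as pd

# Rebuilding the same DataFrame as above
array1 = np.arange(1, 100, 2).reshape(10, 5)
df = pd.DataFrame(array1, columns=['var1','var2','var3','var4','var5'])

print(df.head())          # first five rows
print(df.shape)           # (10, 5) -> rows, columns
print(df['var1'].mean())  # 46.0 -- column statistics work like in NumPy
```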
We can also import a dataframe from a url that points to a csv file. For this example, we will import a csv file from the New York Times containing COVID-19 case counts. Here is the article showing COVID trends online, and here is the GitHub repository where this dataset was found. Pandas supports reading different types of files. Here are some examples:
- pd.read_csv() - reads csv files
- pd.read_excel() - reads Excel files
- pd.read_stata() - reads Stata files
- pd.read_sql() - reads SQL database files

# Reading a DataFrame from a link online
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
df
date | state | fips | cases | deaths | |
---|---|---|---|---|---|
0 | 2020-01-21 | Washington | 53 | 1 | 0 |
1 | 2020-01-22 | Washington | 53 | 1 | 0 |
2 | 2020-01-23 | Washington | 53 | 1 | 0 |
3 | 2020-01-24 | Illinois | 17 | 1 | 0 |
4 | 2020-01-24 | Washington | 53 | 1 | 0 |
... | ... | ... | ... | ... | ... |
61937 | 2023-03-23 | Virginia | 51 | 2298300 | 23782 |
61938 | 2023-03-23 | Washington | 53 | 1940704 | 15905 |
61939 | 2023-03-23 | West Virginia | 54 | 645710 | 8132 |
61940 | 2023-03-23 | Wisconsin | 55 | 2014524 | 16485 |
61941 | 2023-03-23 | Wyoming | 56 | 185800 | 2014 |
61942 rows × 5 columns
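After importing, it is good practice to check the columns, row count, and data types. The sketch below uses a tiny offline stand-in with the same columns (an assumption, so the snippet runs without re-downloading the file):

```python
import pandas as pd

# Stand-in for the NYT DataFrame: same columns, one illustrative row
df = pd.DataFrame({'date': ['2020-01-21'], 'state': ['Washington'],
                   'fips': [53], 'cases': [1], 'deaths': [0]})
print(list(df.columns))  # ['date', 'state', 'fips', 'cases', 'deaths']
print(len(df))           # number of rows
print(df.dtypes)         # per-column data types
```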
Indexing data from a DataFrame can be done in two ways. The first is to index the DataFrame by column name, optionally combined with a condition.
# Indexing DataFrame to only give us data on the states
df['state']
0             Washington
1             Washington
2             Washington
3               Illinois
4             Washington
              ...
61937           Virginia
61938         Washington
61939      West Virginia
61940          Wisconsin
61941            Wyoming
Name: state, Length: 61942, dtype: object
# Indexing DataFrame to only give us data on the states and cases
# When indexing multiple variables, we need to include a second set of brackets
df[['state','cases']]
state | cases | |
---|---|---|
0 | Washington | 1 |
1 | Washington | 1 |
2 | Washington | 1 |
3 | Illinois | 1 |
4 | Washington | 1 |
... | ... | ... |
61937 | Virginia | 2298300 |
61938 | Washington | 1940704 |
61939 | West Virginia | 645710 |
61940 | Wisconsin | 2014524 |
61941 | Wyoming | 185800 |
61942 rows × 2 columns
# Indexing DataFrame by slicing/telling Python to get specific rows
df[10:20]
date | state | fips | cases | deaths | |
---|---|---|---|---|---|
10 | 2020-01-26 | Illinois | 17 | 1 | 0 |
11 | 2020-01-26 | Washington | 53 | 1 | 0 |
12 | 2020-01-27 | Arizona | 4 | 1 | 0 |
13 | 2020-01-27 | California | 6 | 2 | 0 |
14 | 2020-01-27 | Illinois | 17 | 1 | 0 |
15 | 2020-01-27 | Washington | 53 | 1 | 0 |
16 | 2020-01-28 | Arizona | 4 | 1 | 0 |
17 | 2020-01-28 | California | 6 | 2 | 0 |
18 | 2020-01-28 | Illinois | 17 | 1 | 0 |
19 | 2020-01-28 | Washington | 53 | 1 | 0 |
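Pandas also provides the .loc and .iloc accessors, which select by label and by position respectively. A short sketch on a hypothetical mini DataFrame:

```python
import pandas as pd

df = pd.DataFrame({'state': ['Washington', 'Illinois', 'Arizona'],
                   'cases': [1, 1, 1]})
# .iloc selects by position; .loc selects by index label (here the labels are 0, 1, 2)
print(df.iloc[0:2])        # first two rows by position
print(df.loc[1, 'state'])  # Illinois
```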
Sometimes, we might only be interested in a particular subset of a DataFrame. Like in R, Pandas allows us to subset data from a DataFrame.
# Subsetting DataFrame to only give us the states that are california and nothing else
df[df['state']=='California']
date | state | fips | cases | deaths | |
---|---|---|---|---|---|
5 | 2020-01-25 | California | 6 | 1 | 0 |
9 | 2020-01-26 | California | 6 | 2 | 0 |
13 | 2020-01-27 | California | 6 | 2 | 0 |
17 | 2020-01-28 | California | 6 | 2 | 0 |
21 | 2020-01-29 | California | 6 | 2 | 0 |
... | ... | ... | ... | ... | ... |
61667 | 2023-03-19 | California | 6 | 12153083 | 104130 |
61723 | 2023-03-20 | California | 6 | 12154293 | 104165 |
61779 | 2023-03-21 | California | 6 | 12154941 | 104185 |
61835 | 2023-03-22 | California | 6 | 12155467 | 104196 |
61891 | 2023-03-23 | California | 6 | 12169158 | 104277 |
1154 rows × 5 columns
We can also subset a DataFrame on negative conditions using the tilde operator ~. Here, we are telling Python to subset the DataFrame so it does not include California.
# Subsetting DataFrame to only give us all the states except California
df[~(df['state']=='California')]
date | state | fips | cases | deaths | |
---|---|---|---|---|---|
0 | 2020-01-21 | Washington | 53 | 1 | 0 |
1 | 2020-01-22 | Washington | 53 | 1 | 0 |
2 | 2020-01-23 | Washington | 53 | 1 | 0 |
3 | 2020-01-24 | Illinois | 17 | 1 | 0 |
4 | 2020-01-24 | Washington | 53 | 1 | 0 |
... | ... | ... | ... | ... | ... |
61937 | 2023-03-23 | Virginia | 51 | 2298300 | 23782 |
61938 | 2023-03-23 | Washington | 53 | 1940704 | 15905 |
61939 | 2023-03-23 | West Virginia | 54 | 645710 | 8132 |
61940 | 2023-03-23 | Wisconsin | 55 | 2014524 | 16485 |
61941 | 2023-03-23 | Wyoming | 56 | 185800 | 2014 |
60788 rows × 5 columns
If you want to make a new DataFrame out of this subset, you simply need to store it in a new object. Additionally, if you want to reset the index, you can use the .reset_index() method to make the index start from 0 in the newly subsetted DataFrame (note that the old index is kept as a new 'index' column unless you pass drop=True).
df2 = df[~(df['state']=='California')].reset_index()
df2
index | date | state | fips | cases | deaths | |
---|---|---|---|---|---|---|
0 | 0 | 2020-01-21 | Washington | 53 | 1 | 0 |
1 | 1 | 2020-01-22 | Washington | 53 | 1 | 0 |
2 | 2 | 2020-01-23 | Washington | 53 | 1 | 0 |
3 | 3 | 2020-01-24 | Illinois | 17 | 1 | 0 |
4 | 4 | 2020-01-24 | Washington | 53 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... |
60783 | 61937 | 2023-03-23 | Virginia | 51 | 2298300 | 23782 |
60784 | 61938 | 2023-03-23 | Washington | 53 | 1940704 | 15905 |
60785 | 61939 | 2023-03-23 | West Virginia | 54 | 645710 | 8132 |
60786 | 61940 | 2023-03-23 | Wisconsin | 55 | 2014524 | 16485 |
60787 | 61941 | 2023-03-23 | Wyoming | 56 | 185800 | 2014 |
60788 rows × 6 columns
What if you want to subset based on multiple conditions? You can, by combining conditions with the '&' operator and placing each condition in its own parentheses. Here, we are interested in only seeing the days that California had over 500 cases.
df = df[(df['state']=='California') & (df['cases']>500)].reset_index()
df
index | date | state | fips | cases | deaths | |
---|---|---|---|---|---|---|
0 | 746 | 2020-03-16 | California | 6 | 588 | 11 |
1 | 799 | 2020-03-17 | California | 6 | 732 | 14 |
2 | 853 | 2020-03-18 | California | 6 | 893 | 17 |
3 | 907 | 2020-03-19 | California | 6 | 1067 | 19 |
4 | 961 | 2020-03-20 | California | 6 | 1283 | 24 |
... | ... | ... | ... | ... | ... | ... |
1098 | 61667 | 2023-03-19 | California | 6 | 12153083 | 104130 |
1099 | 61723 | 2023-03-20 | California | 6 | 12154293 | 104165 |
1100 | 61779 | 2023-03-21 | California | 6 | 12154941 | 104185 |
1101 | 61835 | 2023-03-22 | California | 6 | 12155467 | 104196 |
1102 | 61891 | 2023-03-23 | California | 6 | 12169158 | 104277 |
1103 rows × 6 columns
Likewise, we can use the tilde operator (~) to subset a DataFrame on several negative conditions.
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
# Subsetting DataFrame to rows that are not California and have 500 or fewer cases
df = df[~(df['state']=='California') & ~(df['cases']>500)].reset_index()
df
index | date | state | fips | cases | deaths | |
---|---|---|---|---|---|---|
0 | 0 | 2020-01-21 | Washington | 53 | 1 | 0 |
1 | 1 | 2020-01-22 | Washington | 53 | 1 | 0 |
2 | 2 | 2020-01-23 | Washington | 53 | 1 | 0 |
3 | 3 | 2020-01-24 | Illinois | 17 | 1 | 0 |
4 | 4 | 2020-01-24 | Washington | 53 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... |
2413 | 40664 | 2022-03-09 | American Samoa | 60 | 343 | 0 |
2414 | 40720 | 2022-03-10 | American Samoa | 60 | 382 | 0 |
2415 | 40776 | 2022-03-11 | American Samoa | 60 | 382 | 0 |
2416 | 40832 | 2022-03-12 | American Samoa | 60 | 433 | 0 |
2417 | 40888 | 2022-03-13 | American Samoa | 60 | 433 | 0 |
2418 rows × 6 columns
Like '&', you can use the '|' (or) operator to have Python subset based on multiple conditions pertaining to one variable of interest. In this example, we subset the DataFrame so it keeps only California and New York.
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
df[(df['state']=="California") | (df['state']=="New York")]
date | state | fips | cases | deaths | |
---|---|---|---|---|---|
5 | 2020-01-25 | California | 6 | 1 | 0 |
9 | 2020-01-26 | California | 6 | 2 | 0 |
13 | 2020-01-27 | California | 6 | 2 | 0 |
17 | 2020-01-28 | California | 6 | 2 | 0 |
21 | 2020-01-29 | California | 6 | 2 | 0 |
... | ... | ... | ... | ... | ... |
61808 | 2023-03-21 | New York | 36 | 6802990 | 79987 |
61835 | 2023-03-22 | California | 6 | 12155467 | 104196 |
61864 | 2023-03-22 | New York | 36 | 6803974 | 80138 |
61891 | 2023-03-23 | California | 6 | 12169158 | 104277 |
61920 | 2023-03-23 | New York | 36 | 6805271 | 80109 |
2272 rows × 5 columns
We can use the '|' operator to chain more than two conditions on a variable. Here, we will keep four states.
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
df[(df['state']=="California") | (df['state']=="New York") | (df['state']=="Washington") | (df['state']=="Texas")]
date | state | fips | cases | deaths | |
---|---|---|---|---|---|
0 | 2020-01-21 | Washington | 53 | 1 | 0 |
1 | 2020-01-22 | Washington | 53 | 1 | 0 |
2 | 2020-01-23 | Washington | 53 | 1 | 0 |
4 | 2020-01-24 | Washington | 53 | 1 | 0 |
5 | 2020-01-25 | California | 6 | 1 | 0 |
... | ... | ... | ... | ... | ... |
61882 | 2023-03-22 | Washington | 53 | 1940704 | 15905 |
61891 | 2023-03-23 | California | 6 | 12169158 | 104277 |
61920 | 2023-03-23 | New York | 36 | 6805271 | 80109 |
61933 | 2023-03-23 | Texas | 48 | 8447168 | 94518 |
61938 | 2023-03-23 | Washington | 53 | 1940704 | 15905 |
4566 rows × 5 columns
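Chaining many '|' comparisons quickly gets verbose. pandas also provides the .isin() method, which tests membership in a list of values. A minimal sketch on a toy DataFrame (the values are illustrative, not the real COVID data):

```python
import pandas as pd

# Toy DataFrame standing in for the COVID data
df = pd.DataFrame({
    'state': ['California', 'New York', 'Washington', 'Texas', 'Ohio'],
    'cases': [100, 90, 80, 70, 60],
})

# .isin() replaces a long chain of '|' comparisons with one membership test
keep = ['California', 'New York', 'Washington', 'Texas']
subset = df[df['state'].isin(keep)]
print(subset['state'].tolist())  # ['California', 'New York', 'Washington', 'Texas']
```

Negating with ~df['state'].isin(keep) works the same way for excluding several states at once.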
Pandas allows us to make changes to DataFrames similar to how we can manipulate DataFrames in R.
The following code snippets show different variations of adding a new variable to an existing DataFrame.
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
# Creating a new variable
df['death/case ratio'] = 0
# Like in R, you can use the head() function to give you the first 5 observations
df.head()
date | state | fips | cases | deaths | death/case ratio | |
---|---|---|---|---|---|---|
0 | 2020-01-21 | Washington | 53 | 1 | 0 | 0 |
1 | 2020-01-22 | Washington | 53 | 1 | 0 | 0 |
2 | 2020-01-23 | Washington | 53 | 1 | 0 | 0 |
3 | 2020-01-24 | Illinois | 17 | 1 | 0 | 0 |
4 | 2020-01-24 | Washington | 53 | 1 | 0 | 0 |
# Creating a new variable/replacing a current one with new values; in this case, we will create a variable that calculates the ratio of deaths to cases
# Python will calculate what you want it to and will do it per row
df['death/case ratio'] = (df['deaths']/df['cases'])
# Like in R, you can use the tail() function to get the last 5 observations
df.tail()
date | state | fips | cases | deaths | death/case ratio | |
---|---|---|---|---|---|---|
61937 | 2023-03-23 | Virginia | 51 | 2298300 | 23782 | 0.010348 |
61938 | 2023-03-23 | Washington | 53 | 1940704 | 15905 | 0.008195 |
61939 | 2023-03-23 | West Virginia | 54 | 645710 | 8132 | 0.012594 |
61940 | 2023-03-23 | Wisconsin | 55 | 2014524 | 16485 | 0.008183 |
61941 | 2023-03-23 | Wyoming | 56 | 185800 | 2014 | 0.010840 |
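Because pandas arithmetic is applied element-wise across rows, you can also build conditional variables in one vectorized step with NumPy's np.where(). A small sketch on toy data (the 0.01 cutoff is purely illustrative):

```python
import pandas as pd
import numpy as np

df = pd.DataFrame({'cases': [100, 2000, 50], 'deaths': [1, 40, 0]})

# Vectorized arithmetic works element-wise across rows
df['ratio'] = df['deaths'] / df['cases']

# np.where builds a conditional column in one step
# (the 0.01 cutoff is purely illustrative)
df['high_ratio'] = np.where(df['ratio'] > 0.01, 'high', 'low')
print(df['high_ratio'].tolist())  # ['low', 'high', 'low']
```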
# Replacing observations from DataFrame across all columns from the 0th to 2nd observation
# Copying the df using .copy(); this ensures changes to the copy do not affect the original DataFrame
df2 = df.copy()
# Telling Python to replace all columns from the 0th row to the 2nd with 1000
df2.iloc[0:3,:]=1000
df2.head()
date | state | fips | cases | deaths | death/case ratio | |
---|---|---|---|---|---|---|
0 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000.0 |
1 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000.0 |
2 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000.0 |
3 | 2020-01-24 | Illinois | 17 | 1 | 0 | 0.0 |
4 | 2020-01-24 | Washington | 53 | 1 | 0 | 0.0 |
# Telling Python to replace the first and second columns, rows 0 through 2, with the string 'DataFrame'
df2.iloc[0:3,0:2]='DataFrame'
df2.head()
date | state | fips | cases | deaths | death/case ratio | |
---|---|---|---|---|---|---|
0 | DataFrame | DataFrame | 1000 | 1000 | 1000 | 1000.0 |
1 | DataFrame | DataFrame | 1000 | 1000 | 1000 | 1000.0 |
2 | DataFrame | DataFrame | 1000 | 1000 | 1000 | 1000.0 |
3 | 2020-01-24 | Illinois | 17 | 1 | 0 | 0.0 |
4 | 2020-01-24 | Washington | 53 | 1 | 0 | 0.0 |
# Telling Python to replace all observations in the fips column
df2.loc[:,'fips']='Python'
df2.head()
date | state | fips | cases | deaths | death/case ratio | |
---|---|---|---|---|---|---|
0 | DataFrame | DataFrame | Python | 1000 | 1000 | 1000.0 |
1 | DataFrame | DataFrame | Python | 1000 | 1000 | 1000.0 |
2 | DataFrame | DataFrame | Python | 1000 | 1000 | 1000.0 |
3 | 2020-01-24 | Illinois | Python | 1 | 0 | 0.0 |
4 | 2020-01-24 | Washington | Python | 1 | 0 | 0.0 |
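The difference between .iloc (position-based) and .loc (label-based) trips up many newcomers, especially once the index no longer starts at 0. A minimal self-checking sketch on a toy DataFrame:

```python
import pandas as pd

df = pd.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]}, index=[100, 101, 102])

# .iloc is purely positional: row 0, column 0
assert df.iloc[0, 0] == 10

# .loc is label-based: row labeled 101, column labeled 'b'
assert df.loc[101, 'b'] == 2

# Slices differ too: .iloc excludes the stop, .loc includes it
assert len(df.iloc[0:2]) == 2      # positions 0 and 1
assert len(df.loc[100:102]) == 3   # labels 100 through 102, inclusive
```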
The following code snippets show different variations of dropping variables from an existing DataFrame.
## Dropping last variable, death/case ratio
## Axis 1 is columns
## inplace=True means the changes are applied to the DataFrame itself; inplace=False (the default) returns a modified copy instead
df2.drop('death/case ratio', axis=1, inplace= True)
df2.head()
date | state | fips | cases | deaths | |
---|---|---|---|---|---|
0 | DataFrame | DataFrame | Python | 1000 | 1000 |
1 | DataFrame | DataFrame | Python | 1000 | 1000 |
2 | DataFrame | DataFrame | Python | 1000 | 1000 |
3 | 2020-01-24 | Illinois | Python | 1 | 0 |
4 | 2020-01-24 | Washington | Python | 1 | 0 |
## Dropping the first three rows
## Axis is 0 for rows
df2.drop([0,1,2], axis=0, inplace= True)
df2 = df2.reset_index()
df2.head()
index | date | state | fips | cases | deaths | |
---|---|---|---|---|---|---|
0 | 3 | 2020-01-24 | Illinois | Python | 1 | 0 |
1 | 4 | 2020-01-24 | Washington | Python | 1 | 0 |
2 | 5 | 2020-01-25 | California | Python | 1 | 0 |
3 | 6 | 2020-01-25 | Illinois | Python | 1 | 0 |
4 | 7 | 2020-01-25 | Washington | Python | 1 | 0 |
## Dropping the first 50 rows
## Axis is 0 for rows
df2.drop(range(0,50), axis=0, inplace= True)
df2 = df2.reset_index()
df2.head()
level_0 | index | date | state | fips | cases | deaths | |
---|---|---|---|---|---|---|---|
0 | 50 | 53 | 2020-02-05 | California | Python | 6 | 0 |
1 | 51 | 54 | 2020-02-05 | Illinois | Python | 2 | 0 |
2 | 52 | 55 | 2020-02-05 | Massachusetts | Python | 1 | 0 |
3 | 53 | 56 | 2020-02-05 | Washington | Python | 1 | 0 |
4 | 54 | 57 | 2020-02-05 | Wisconsin | Python | 1 | 0 |
When we reset the index, the old index moves into the DataFrame as a column, so we must get rid of it (or pass drop=True to .reset_index() to discard it up front).
df2.drop(['level_0','index'], axis=1, inplace= True)
df2.head()
date | state | fips | cases | deaths | |
---|---|---|---|---|---|
0 | 2020-02-05 | California | Python | 6 | 0 |
1 | 2020-02-05 | Illinois | Python | 2 | 0 |
2 | 2020-02-05 | Massachusetts | Python | 1 | 0 |
3 | 2020-02-05 | Washington | Python | 1 | 0 |
4 | 2020-02-05 | Wisconsin | Python | 1 | 0 |
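To avoid the chore of dropping leftover index columns, .reset_index() accepts drop=True, which discards the old index instead of inserting it as a column. A quick sketch on toy data:

```python
import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3, 4]})
df = df.drop([0, 1])          # index is now [2, 3]

# drop=True discards the old index instead of adding it as a column
df = df.reset_index(drop=True)
print(df.columns.tolist())  # ['x']  (no leftover 'index' column)
print(df.index.tolist())    # [0, 1]
```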
Sometimes, you might need to change the type of a variable due to poor formatting or other reasons. Python allows us to change data types using the .to_numeric()
or .astype()
functions. The following code snippets show different variations of changing data types.
To get a better sense of what data types we have within our DataFrame, we can use the .dtypes
command to have Python give us this info for each variable. For this example, we will create a new variable that is a string and convert it to an integer.
df['var7'] = '0'
df.dtypes
date object state object fips int64 cases int64 deaths int64 death/case ratio float64 var7 object dtype: object
We will change the 'var7' variable from an 'object' type to an 'integer' type.
df['var7'] = df['var7'].astype(int)
df.dtypes
date object state object fips int64 cases int64 deaths int64 death/case ratio float64 var7 int64 dtype: object
As you can see, the 'var7' we created is now an integer. We can also turn it back into a string.
df['var7'] = df['var7'].astype(str)
df.dtypes
date object state object fips int64 cases int64 deaths int64 death/case ratio float64 var7 object dtype: object
In DataFrames, a string is categorized as an object, so we know that if it says object, then it is a string type.
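The pd.to_numeric() function mentioned above is especially handy when a column contains badly formatted entries: with errors='coerce', unparseable values become NaN instead of raising an error. A small sketch on a toy Series:

```python
import pandas as pd

s = pd.Series(['1', '2', 'three', '4'])

# errors='coerce' turns unparseable entries into NaN instead of raising
nums = pd.to_numeric(s, errors='coerce')
print(nums.isna().sum())  # 1  (only 'three' failed to parse)
print(nums.dtype)         # float64
```

By contrast, s.astype(int) would fail outright here, since 'three' cannot be converted.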
Sometimes, you might want to organize your DataFrame by a particular variable or perform operations across groups. We can do this using the .sort_values()
and .groupby()
functions. The .sort_values()
function can organize your DataFrame by columns of choice, and the .groupby()
function can perform statistical operations by categorical groups. It can also tabulate data (similar to the table()
function in R).
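As a pandas analogue of R's table(), the .value_counts() method tabulates how often each category appears. A quick sketch on a toy Series:

```python
import pandas as pd

s = pd.Series(['CA', 'TX', 'CA', 'NY', 'CA', 'TX'])

# value_counts tabulates category frequencies, like table() in R
counts = s.value_counts()
print(counts['CA'])  # 3
print(counts['TX'])  # 2
```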
# Sort DataFrame by state in ascending order
df.sort_values(by=['state'], inplace=True)
df.head()
date | state | fips | cases | deaths | death/case ratio | var7 | |
---|---|---|---|---|---|---|---|
25974 | 2021-06-18 | Alabama | 1 | 548657 | 11306 | 0.020607 | 0 |
22784 | 2021-04-21 | Alabama | 1 | 524367 | 10807 | 0.020610 | 0 |
3424 | 2020-05-04 | Alabama | 1 | 8112 | 298 | 0.036736 | 0 |
22839 | 2021-04-22 | Alabama | 1 | 525049 | 10824 | 0.020615 | 0 |
39206 | 2022-02-11 | Alabama | 1 | 1259580 | 17505 | 0.013897 | 0 |
# Sort DataFrame by state in descending order
df.sort_values(by=['state'], inplace=True, ascending=False)
df.head()
date | state | fips | cases | deaths | death/case ratio | var7 | |
---|---|---|---|---|---|---|---|
61941 | 2023-03-23 | Wyoming | 56 | 185800 | 2014 | 0.010840 | 0 |
57741 | 2023-01-07 | Wyoming | 56 | 183151 | 1959 | 0.010696 | 0 |
47213 | 2022-07-03 | Wyoming | 56 | 164456 | 1834 | 0.011152 | 0 |
47661 | 2022-07-11 | Wyoming | 56 | 165619 | 1834 | 0.011074 | 0 |
43629 | 2022-04-30 | Wyoming | 56 | 156550 | 1812 | 0.011575 | 0 |
The .groupby()
function can be used in combination with other statistical operations to answer particular questions we might have.
# Grouping DataFrame by state and counting fips codes to get the number of observations per state
df.groupby("state")['fips'].count()
state Alabama 1106 Alaska 1107 American Samoa 548 Arizona 1153 Arkansas 1108 California 1154 Colorado 1114 Connecticut 1111 Delaware 1108 District of Columbia 1112 Florida 1118 Georgia 1117 Guam 1104 Hawaii 1113 Idaho 1106 Illinois 1155 Indiana 1113 Iowa 1111 Kansas 1112 Kentucky 1113 Louisiana 1110 Maine 1107 Maryland 1114 Massachusetts 1147 Michigan 1109 Minnesota 1113 Mississippi 1108 Missouri 1112 Montana 1106 Nebraska 1131 Nevada 1114 New Hampshire 1117 New Jersey 1115 New Mexico 1108 New York 1118 North Carolina 1116 North Dakota 1108 Northern Mariana Islands 1091 Ohio 1110 Oklahoma 1113 Oregon 1120 Pennsylvania 1113 Puerto Rico 1106 Rhode Island 1118 South Carolina 1113 South Dakota 1109 Tennessee 1114 Texas 1136 Utah 1123 Vermont 1112 Virgin Islands 1105 Virginia 1112 Washington 1158 West Virginia 1102 Wisconsin 1143 Wyoming 1108 Name: fips, dtype: int64
# Grouping DataFrame by state and getting the sum of deaths
df.groupby("state")['deaths'].sum()
state Alabama 13758382 Alaska 747325 American Samoa 11048 Arizona 21257570 Arkansas 7905784 California 67503932 Colorado 8976123 Connecticut 9082510 Delaware 2113929 District of Columbia 1160633 Florida 52694427 Georgia 25872159 Guam 239558 Hawaii 945585 Idaho 3264273 Illinois 28833752 Indiana 16853649 Iowa 6953414 Kansas 6182474 Kentucky 10223505 Louisiana 13259185 Maine 1464390 Maryland 11055992 Massachusetts 18434286 Michigan 26110438 Minnesota 9306987 Mississippi 9103032 Missouri 13875907 Montana 2246802 Nebraska 3034055 Nevada 7437941 New Hampshire 1777416 New Jersey 28482863 New Mexico 5393922 New York 59061061 North Carolina 17314387 North Dakota 1729703 Northern Mariana Islands 17462 Ohio 25744838 Oklahoma 9457851 Oregon 4865473 Pennsylvania 32653246 Puerto Rico 3223949 Rhode Island 2855775 South Carolina 12543645 South Dakota 2159513 Tennessee 17188151 Texas 62871518 Utah 3194695 Vermont 434397 Virgin Islands 72839 Virginia 14102858 Washington 8960697 West Virginia 4434127 Wisconsin 10004696 Wyoming 1164870 Name: deaths, dtype: int64
# Grouping DataFrame by state and getting the maximum number of deaths per state
df.groupby("state")['deaths'].max()
state Alabama 21631 Alaska 1438 American Samoa 34 Arizona 33190 Arkansas 13068 California 104277 Colorado 14245 Connecticut 12270 Delaware 3352 District of Columbia 1487 Florida 87141 Georgia 41055 Guam 421 Hawaii 1851 Idaho 5456 Illinois 41618 Indiana 26179 Iowa 10770 Kansas 10232 Kentucky 18348 Louisiana 18835 Maine 2981 Maryland 16672 Massachusetts 24441 Michigan 42311 Minnesota 14964 Mississippi 13431 Missouri 23998 Montana 3701 Nebraska 5068 Nevada 12093 New Hampshire 3018 New Jersey 36097 New Mexico 9110 New York 80138 North Carolina 29746 North Dakota 2529 Northern Mariana Islands 41 Ohio 42061 Oklahoma 16549 Oregon 9451 Pennsylvania 50701 Puerto Rico 5848 Rhode Island 3915 South Carolina 20192 South Dakota 3222 Tennessee 29035 Texas 94518 Utah 5316 Vermont 939 Virgin Islands 130 Virginia 23782 Washington 15905 West Virginia 8132 Wisconsin 16485 Wyoming 2014 Name: deaths, dtype: int64
Let's group by state and year now. When working with time series data, we must always convert our date variables to a format that Python can interpret: 'datetime'. Using code similar to converting data types, the function pd.to_datetime()
allows us to convert an existing date variable into a datetime object. From there, we can extract specific parts of the date we are interested in. For this example, we will create a year variable out of the date variable we already have.
# Getting Data Types of our current DataFrame
df.dtypes
date object state object fips int64 cases int64 deaths int64 death/case ratio float64 var7 object dtype: object
Our current date variable is categorized as an object, which does not let us use any of the date functionality, so let's convert it to datetime.
# Converting current date to datetime type
df['date'] = pd.to_datetime(df['date'])
df.dtypes
date datetime64[ns] state object fips int64 cases int64 deaths int64 death/case ratio float64 var7 object dtype: object
Now that date is a datetime object, let's extract a year variable in order to group our DataFrame by state and year.
# Creating year variable out of the date variable
df['year'] = df['date'].dt.year
# Grouping by state and year to get sum of deaths per year
df.groupby(['state','year'])['deaths'].sum()
state year Alabama 2020 526388 2021 4330518 2022 7146725 2023 1754751 Alaska 2020 13147 ... Wisconsin 2023 1330461 Wyoming 2020 20734 2021 316182 2022 664976 2023 162978 Name: deaths, Length: 223, dtype: int64
If we want to perform multiple statistical operations, we can use the .agg()
command to tell Python to place them all in one output.
# Grouping by state and year to get sum of deaths per year, highest amount of deaths per year, and average deaths per state
df.groupby(['state','year'])['deaths'].agg(['sum','max','mean'])
sum | max | mean | ||
---|---|---|---|---|
state | year | |||
Alabama | 2020 | 526388 | 4827 | 1790.435374 |
2021 | 4330518 | 16455 | 11864.432877 | |
2022 | 7146725 | 21208 | 19580.068493 | |
2023 | 1754751 | 21631 | 21399.402439 | |
Alaska | 2020 | 13147 | 198 | 44.566102 |
... | ... | ... | ... | ... |
Wisconsin | 2023 | 1330461 | 16485 | 16225.134146 |
Wyoming | 2020 | 20734 | 438 | 70.047297 |
2021 | 316182 | 1526 | 866.252055 | |
2022 | 664976 | 1959 | 1821.852055 | |
2023 | 162978 | 2014 | 1987.536585 |
223 rows × 3 columns
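If you want to control the column names of the aggregated output, .agg() also supports named aggregation, where each keyword becomes an output column. A sketch on a toy DataFrame:

```python
import pandas as pd

df = pd.DataFrame({
    'state': ['CA', 'CA', 'TX', 'TX'],
    'deaths': [10, 20, 5, 15],
})

# Named aggregation lets you label each statistic's output column
out = df.groupby('state').agg(
    total=('deaths', 'sum'),
    worst=('deaths', 'max'),
    average=('deaths', 'mean'),
)
print(out.loc['CA', 'total'])    # 30
print(out.loc['TX', 'average'])  # 10.0
```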
A final task we will cover in relation to data handling is appending datasets. Python gives us the concat()
and merge()
functions to combine multiple data sources into a single dataset. Let's use the concat() function first.
Before concatenating or merging anything, we will create two new DataFrames from our COVID dataset. We will subset California and Texas into their own DataFrames, then bring them back together.
# Creating new df in order to subset it to include CA and TX separately
df_ca = df.copy()
df_tx = df.copy()
# Subsetting df to get CA
df_ca = df_ca[df_ca['state']=='California']
df_tx = df_tx[df_tx['state']=='Texas']
# Subsetting CA to split variables
df3 = df_ca[['date','state','cases']].reset_index(drop=True)
df4 = df_ca[['date','deaths','year']].reset_index(drop=True)
# Sorting new df by date
df_ca.sort_values(by=['date'], inplace=True)
df_tx.sort_values(by=['date'], inplace=True)
df3.sort_values(by=['date'], inplace=True)
df4.sort_values(by=['date'], inplace=True)
# Resetting index; drop=True ensures we don't get an index column in the df
df_ca = df_ca.reset_index(drop=True)
df_tx = df_tx.reset_index(drop=True)
df3 = df3.reset_index(drop=True)
df4 = df4.reset_index(drop=True)
# Dropping unwanted variables
df_ca.drop(['death/case ratio','fips','var7'], axis=1, inplace=True)
df_tx.drop(['death/case ratio','fips','var7'], axis=1, inplace=True)
Because we are interested in joining two datasets by rows, we will be using the concat()
function. Setting ignore_index=True tells Python to build a fresh 0-based index rather than keeping each DataFrame's original index.
df_both_states = pd.concat([df_ca, df_tx], ignore_index=True)
df_both_states.head()
date | state | cases | deaths | year | |
---|---|---|---|---|---|
0 | 2020-01-25 | California | 1 | 0 | 2020 |
1 | 2020-01-26 | California | 2 | 0 | 2020 |
2 | 2020-01-27 | California | 2 | 0 | 2020 |
3 | 2020-01-28 | California | 2 | 0 | 2020 |
4 | 2020-01-29 | California | 2 | 0 | 2020 |
If we wanted to join two datasets based on columns, unlike concat(), which joins on rows, we would need the merge()
command, which combines datasets horizontally. The merge()
function merges two datasets on key variables (columns). In addition, it comes with a few options for its "how=" argument: 'inner' keeps only rows whose keys appear in both datasets, 'left' keeps all rows from the left dataset, 'right' keeps all rows from the right dataset, and 'outer' keeps all rows from both.
We will be merging with the 'inner' option to ensure that only rows matched in both datasets are kept.
df5 = df3.merge(df4, on="date", how='inner')
df5.head()
date | state | cases | deaths | year | |
---|---|---|---|---|---|
0 | 2020-01-25 | California | 1 | 0 | 2020 |
1 | 2020-01-26 | California | 2 | 0 | 2020 |
2 | 2020-01-27 | California | 2 | 0 | 2020 |
3 | 2020-01-28 | California | 2 | 0 | 2020 |
4 | 2020-01-29 | California | 2 | 0 | 2020 |
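The effect of the how= options is easiest to see on toy data, where the two frames only partially overlap on the key (the 'd1'..'d4' keys are made up for illustration):

```python
import pandas as pd

left = pd.DataFrame({'date': ['d1', 'd2', 'd3'], 'cases': [1, 2, 3]})
right = pd.DataFrame({'date': ['d2', 'd3', 'd4'], 'deaths': [0, 1, 2]})

# 'inner' keeps only dates present in both frames
inner = left.merge(right, on='date', how='inner')
print(len(inner))  # 2  (d2 and d3)

# 'left' keeps every row of the left frame, filling gaps with NaN
left_join = left.merge(right, on='date', how='left')
print(len(left_join))  # 3
print(left_join['deaths'].isna().sum())  # 1  (d1 has no match)
```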
The Matplotlib library allows us to create basic plots out of arrays in Python. We can use this package to create line, scatter, and bar plots, to name a few.
We will start by importing the matplotlib
package below.
import matplotlib.pyplot as plt
In order to demonstrate its capabilities, let's create a simple set of variables that will be used to plot our X and Y axis.
Y = [100,200,300,400,500,600,700,800]
X = [2016,2017,2018,2019,2020,2021,2022,2023]
Now that we created our variables, let's begin by plotting our data using a line plot, followed by a scatter plot.
plt.plot()
is the command for a line graph.
# plotting the data
plt.plot(X, Y)
# Adding a title to our plot
plt.title("Line Plot")
# Adding labels to our plot
plt.ylabel("y-axis")
plt.xlabel("x-axis")
plt.show()
plt.scatter()
is the command for a scatter plot.
# plotting the data
plt.scatter(X, Y)
# Adding a title to our plot
plt.title("Scatter Plot")
# Adding labels to our plot
plt.ylabel("y-axis")
plt.xlabel("x-axis")
plt.show()
plt.bar()
is the command for a bar plot.
# plotting the data
plt.bar(X, Y)
# Adding a title to our plot
plt.title("Bar Plot")
# Adding labels to our plot
plt.ylabel("y-axis")
plt.xlabel("x-axis")
plt.show()
Like in ggplot2, you can make additional adjustments to the plots, such as changing the color of the lines, adding markers along the lines, and even changing the linestyle. The links below provide information about the kinds of arguments you can give to the plot()
function with respect to aesthetics.
The following code shows an example of how to integrate the arguments into the plt.plot()
function.
# plotting the data once more
plt.plot(X, Y, color='coral', marker='o', linestyle='dashed'
)
# Adding a title to our plot just as before
plt.title("Line Plot")
# Adding labels to our plot
plt.ylabel("y-axis")
plt.xlabel("x-axis")
plt.show()
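You can also draw several lines on one plot by calling plot() repeatedly and adding a legend. The sketch below uses Matplotlib's object-oriented interface (fig/ax) with made-up series names; the Agg backend simply lets it run without a display:

```python
import matplotlib
matplotlib.use('Agg')  # headless backend; safe for scripts without a display
import matplotlib.pyplot as plt

X = [2016, 2017, 2018, 2019]
Y1 = [100, 200, 300, 400]
Y2 = [150, 180, 320, 380]

fig, ax = plt.subplots()
# Each plot() call adds one line; 'label' feeds the legend
ax.plot(X, Y1, color='coral', linestyle='dashed', label='Series A')
ax.plot(X, Y2, color='teal', marker='o', label='Series B')
ax.set_title('Two Lines, One Plot')
ax.set_xlabel('x-axis')
ax.set_ylabel('y-axis')
ax.legend()
fig.savefig('two_lines.png')
```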
The examples above show how to plot simple vectors, but what if we want to plot data from a DataFrame? We can use .plot()
to plot specific variables from a DataFrame as well.
Let's start by using the COVID dataset we imported earlier.
# Reading dataframe from a link online
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
# Grouping DataFrame by state and counting cases to get the number of observations per state
df = df.groupby("state")['cases'].count()
# Keeping only the first 10 states in the data frame
df = df[0:10]
For this example, we will plot the number of reported observations per state. Unlike the code above, we will use the .plot()
function in a way that allows us to specify the type of plot and the data objects we will be using.
# plotting the COVID data
# 'kind' tells Python to plot a bar graph
# 'width' tells python to alter the width of the bars; default is set to 0.8
df.plot(kind='bar', x='state', y='cases', color='y', width=0.4)
# adding title
plt.title("Frequency of COVID Cases Per State Plot")
# adding axis-labels
plt.ylabel("Cases")
plt.xlabel("State")
Text(0.5, 0, 'State')
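One common tweak for bar plots with long category names is rotating the x-axis tick labels so they do not overlap. A sketch on a toy stand-in for the grouped counts (the numbers are illustrative):

```python
import matplotlib
matplotlib.use('Agg')  # headless backend so this runs without a display
import matplotlib.pyplot as plt
import pandas as pd

# Toy stand-in for the grouped COVID counts
counts = pd.Series([1106, 1107, 548], index=['Alabama', 'Alaska', 'American Samoa'])

ax = counts.plot(kind='bar', color='y', width=0.4)
plt.title('Frequency of Observations Per State')
plt.ylabel('Observations')
plt.xlabel('State')
# Rotate the long state names so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
```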
While Matplotlib is a powerful library, its modularity can mean a steep learning curve. Luckily, Python offers another package that simplifies plot making. Seaborn, like Matplotlib, is used for plotting graphs, and it builds on the Matplotlib, Pandas, and NumPy libraries to do so. Its simpler syntax lets users quickly pick up plotting and create aesthetically pleasing graphs that display relationships in data. The remainder of the workshop lessons will mainly rely on Seaborn to produce graphs.
We will start by importing the seaborn
package below.
# Unlike the packages above, our program does not include seaborn by default, so we must use the `conda install` command to download the library before using it. To install, remove the hash below, click the icon that appears above the "conda install seaborn" line of code, and click install. After that, importing the seaborn package should work.
# conda install seaborn
import seaborn as sns
To show its power, let's use a simple example where we observe the average number of yearly COVID cases in California.
# Reading dataframe from a link online
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
# Subsetting df to only include CA; .copy() avoids pandas' SettingWithCopyWarning when we add columns below
df = df[(df['state']=="California")].copy()
# Converting current date to datetime type
df['date'] = pd.to_datetime(df['date'])
# Creating year variable out of date variable
df['year'] = df['date'].dt.year
# Converting the cumulative case counts into daily new cases
df['cases_(-1)']=df['cases'].shift(1)
df['non_cum_cases']= df['cases']-df['cases_(-1)']
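The shift-and-subtract pattern above is common enough that pandas provides it directly as .diff(). A quick sketch on toy cumulative counts showing the two are equivalent:

```python
import pandas as pd

# Toy cumulative case counts, as in the COVID data
cum = pd.Series([1, 3, 6, 10])

# shift-and-subtract, as above...
manual = cum - cum.shift(1)

# ...is equivalent to the built-in diff()
auto = cum.diff()

print(auto.tolist())  # [nan, 2.0, 3.0, 4.0]
assert manual.equals(auto)
```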
The following is the simplest way to create a plot using seaborn. Like Matplotlib, Seaborn allows us to produce line plots, bar plots, box plots, scatter plots, kernel density plots, regression plots, etc.
sns.lineplot(data=df, x='year', y='non_cum_cases')
<AxesSubplot:xlabel='year', ylabel='non_cum_cases'>
Like in Matplotlib, we are able to give seaborn additional arguments to make our plots more customizable. The sns.set_style
function tells Python to set a preset theme for the plot we will be using. The presets seaborn has available are 'darkgrid', 'whitegrid', 'dark', 'white', and 'ticks'.
# Setting theme style
sns.set_style('ticks')
# ci=None removes the confidence intervals (newer seaborn versions use errorbar=None instead)
# linestyle changes the style of the line
# color changes the color of the line
plot = sns.lineplot(data=df, x='year', y='non_cum_cases', color='y', linestyle='solid', ci=None)
# Adding title and labels
plot.set_title('Average COVID Cases per Year (CA)', fontdict={'size': 18, 'weight': 'normal'})
plot.set_xlabel('Year', fontdict={'size': 12})
plot.set_ylabel('COVID Cases (Avg)', fontdict={'size': 12})
# Saving figure to your directory
fig = plot.get_figure()
fig.savefig('output.png')