cqs

pdf

School

Binghamton University *

*We aren’t endorsed by this school

Course

311

Subject

Statistics

Date

Jan 9, 2024

Type

pdf

Pages

2

Uploaded by JusticeJellyfish31029

Report
import matplotlib.pyplot as plt # Create a Pandas DataFrame from a dictionary d = {'one': [1, 2, 3, 4], 'two': [4, 3, 2, 1]} df = pd.DataFrame(d) df import pandas as pd # Create a Pandas Series s = pd.Series([1, 3, 5, 6, 8, 10]) s 0 1 1 3 2 5 3 6 4 8 5 10 dtype: int64 df.head() shows first 5 rows df.tail() shows last 5 rows df.shape shows the shape of the table as (rows, columns) (doesn’t include labels) df.columns shows columns df.describe() shows statistics # Importing data from csv file df = pd.read_csv('weather.csv') df # What was the maximum temperature in the month? df['Temperature'].max() # On which days did it snow? df['EST'][df['Events'] == 'Snow'] # How many days did it rain? df['EST'][df['Events'] == 'Rain'].count() # What was the average speed of wind during the month? df['WindSpeedMPH'].mean() # Select the days with temperature less than 30. df[df['Temperature'] < 30] # What was the average temperature for each event? df.groupby('Events').mean()['Temperature'] # Which day had the highest humidity? 
df['EST'][df['Humidity'] == df['Humidity'].max()] # Sort the temperature from lowest to highest values df.sort_values('Temperature') # Making changes to data # Adding a new column # FORMULA FOR CONVERTING F TO C = (Temperature in F - 32) * 5/9 df['TemperatureInC'] = (df['Temperature'] - 32) * 5/9 # Dropping a column df = df.drop(columns = ['TemperatureInC']) # Creating a Pandas DataFrame from a list of tuples weather_data = [('1/1/2022', 32, 6, 'Rain'), ('1/2/2022', 35, 7, 'Sunny'), ('1/3/2022', 28, 2, 'Snow'), ('1/4/2022', 24, 7, 'Snow'), ('1/5/2022', 32, 4, 'Rain'), ('1/6/2022', 31, 2, 'Sunny'), ('1/7/2022', 28, 6, 'Rain')] df = pd.DataFrame(weather_data, columns=['day', 'temperature', 'windspeed', 'event']) df df['temperature'] df['temperature'].max() df['temperature'] == df['temperature'].max() df[df['temperature'] == df['temperature'].max()] select rows which have maximum temperature df['day'][df['temperature'] == df['temperature'].max()] select only day column which has maximum temperature df.sort_values('temperature') Sorting the dataframe by temperature values (lowest to highest) df.sort_values('temperature', ascending = False) Sorting the dataframe by temperature values (highest to lowest) df.groupby('event').mean() Grouping the dataframe by event and finding the mean for each event df['day'] selects a single column type(df['day']) pandas.core.series.Series df[['day', 'event']] selects multiple columns type(df[['day', 'event']]) pandas.core.frame.DataFrame df = pd.read_excel('glassdoor_reviews.xls') pd.set_option('display.max_columns', None) pd.set_option('max_colwidth', None) df # Select columns 'pros','cons','overallratings','jobtitle' where the value of 'cposemo' is equal to the max. value in the dataset df[['pros','cons','overallratings','jobtitle']][df['cposemo'] == df['cposemo'].max()] # Select columns 'company','location','dates','jobtitle','pros','cons','overallratings' # where the value of 'helpfulcount' is the max. 
value in the dataset df[['company','location','dates','jobtitle','pros','cons','overallratings']][df['helpfulcount'] == df['helpfulcount'].max()] # Select columns 'company','location','dates','jobtitle','pros','cons','overallratings' # where the value of 'helpfulcount' is greater than 25 df[['company','location','dates','jobtitle','pros','cons','overallratings']][df['helpfulcount'] > 25] # Filtering data # Select all rows where company == 'google' df[df['company'] == 'google'] # Selecting reviews of the company 'google' and overall rating 'greater than 4' df[(df['company'] == 'google') & (df['overallratings'] > 4)] # Selecting number of reviews of the company 'google' and overall rating 'greater than 4' df['company'][(df['company'] == 'google') & (df['overallratings'] > 4)].count() # Selecting all company reviews with overall rating greater than 4 or helpful count greater than 20 df[(df['overallratings'] > 4) | (df['helpfulcount'] > 20)] # Select data whose value "is in" a list of values df[df['company'].isin(['apple', 'microsoft'])] # Filtering jobtitle containing the text 'Analyst' df[df['jobtitle'].str.contains('Analyst')] # Get the count of reviews having 'Intern' in the job title df['jobtitle'][df['jobtitle'].str.contains('Intern')].count() # Summary functions df['overallratings'].describe() # Get distinct values df['company'].unique() # Count number of rows with each unique value of variable df['company'].value_counts() # Groupwise analysis df.groupby('company').mean() # Groupby with sorting df.groupby('company').mean().sort_values('overallratings', ascending = False)[['overallratings','status']] df.groupby('company')['overallratings'].mean() df.groupby('company').sum()['helpfulcount'] # Groupby analysis df.groupby('status').mean()[['overallratings']] # Groupby multiple columns df.groupby(['company','status']).mean() The keys in a dictionary must be unique. True We can create a NumPy array by using the array() function. 
True Pie charts show the number of observations within each given interval. False
import numpy as np arr = np.array([1, 2, 3, 4, 5]) print(arr) print(type(arr)) [1 2 3 4 5] <class 'numpy.ndarray'> arr = np.array([[1, 2, 3], [4, 5, 6]]) [[1 2 3] [4 5 6]] import numpy as np a = np.array([1, 2, 3, 4, 5]) b = np.array([[1, 2, 3], [4, 5, 6]]) print(a.ndim) print(b.ndim) 1 2 # Array indexing for 1-D array import numpy as np arr = np.array([1, 2, 3, 4]) print(arr[0]) 1 # Array indexing for 2-D array import numpy as np arr = np.array([[1,2,3,4,5], [6,7,8,9,10]]) print('2nd element on 1st dim: ', arr[0, 1]) 2nd element on 1st dim: 2 # Slicing 1-D array arr = np.array([1, 2, 3, 4, 5, 6, 7]) print(arr[1:5]) [2 3 4 5] # Slicing 2-D array arr = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) print(arr[1, 1:4]) [7 8 9] import numpy as np arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) print(arr.ndim) print(arr.size) print(arr.shape) 2 8 (2, 4) import numpy as np arr = np.array([1, 2, 3, 4]) print(arr.dtype) int32 ndarray.ndim will tell you the number of axes, or dimensions, of the array. ndarray.size will tell you the total number of elements of the array. This is the product of the elements of the array’s shape. ndarray.shape will display a tuple of integers that indicate the number of elements stored along each dimension of the array. If, for example, you have a 2-D array with 2 rows and 3 columns, the shape of your array is (2, 3). import numpy as np arr1 = np.array([10, 20, 30, 40]) arr2 = np.array([1, 2, 3, 4]) # Addition print(arr1 + arr2) print(np.add(arr1, arr2)) # Subtraction print(arr1 - arr2) print(np.subtract(arr1, arr2)) # Multiplication print(arr1 * arr2) print(np.multiply(arr1, arr2)) # Division print(arr1 / arr2) print(np.divide(arr1, arr2)) [11 22 33 44] [11 22 33 44] [ 9 18 27 36] [ 9 18 27 36] [ 10 40 90 160] [ 10 40 90 160] [10. 10. 10. 10.] [10. 10. 10. 10.] 
arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) # To print all the values in the array that are less than 5 print(arr[arr < 5]) [1 2 3 4] # Other operations arr1 = np.array([10, 20, 30, 40]) # Find the sum of the elements in an array print(arr1.sum()) # Find the smallest element in an array print(arr1.min()) # Find the largest element in an array print(arr1.max()) # Operation between an array and a single number print(arr1 * 1.5) print(arr1 + 1) 100 10 40 [15. 30. 45. 60.] [11 21 31 41] import numpy as np a = np.array([[0.4, 0.1, 0.3, 0.5], [0.5, 0.05, 0.4, 0.5], [0.1, 0.8, 0.2, 0.5]]) print(a.sum()) print(a.min()) print(a.max()) # Average of the array elements print('Mean:',a.mean()) # Median of the array elements print('Median:', np.median(a)) # Variance of the array elements print('Variance:',a.var()) # Standard deviation of the array elements print('Standard Deviation:',a.std()) # Product of the array elements print('Product:',a.prod()) 4.3500000000000005 0.05 0.8 Mean: 0.36250000000000004 Median: 0.4 Variance: 0.04463541666666667 Standard Deviation: 0.2112709555681203 Product: 2.4000000000000014e-07 fmt = '[marker][line][color]' barh() for horizontal bars plt.bar(category,quantity, color = 'red') red bars plt.bar(category,quantity, width = 0.2) thinner bars # pie chart sales = [200, 600, 300, 100] plt.pie(sales) plt.show() # labels sales = [200, 600, 300, 100] slice_labels = ['1st Qtr', '2nd Qtr', '3rd Qtr', '4th Qtr'] plt.pie(sales, labels = slice_labels) plt.title('Sales by Quarter') plt.show() # legend plt.legend(sales) # histogram plt.hist(ages) # adding bins plt.hist(ages, bins = 5)
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help