# -*- coding: utf-8 -*-
"""MiniProject.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bj_Sy7v0VLzOD9X-ual0SHzhy5ehBbL8

<h1 align="center"><strong>DS/CMPSC 410 - Mini Project</strong></h1>
<h2 align="center"><strong>Crime Analysis in Chicago City.</strong></h2>

## Instructor: Professor Romit Maulik

## Team Members:
### - Sai Sanwariya Narayan
### - Nikhil Melligeri
### - Shafwat Mustafa
### - Rohan Singh
### - Shengdi You
### - Daniel Gao
### - Nathan Quint
"""

##pip install pyspark

import pyspark
import pandas as pd
import numpy as np
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType, FloatType
from pyspark.sql.functions import col, column
from pyspark.sql.functions import expr
from pyspark.sql.functions import split
from datetime import datetime
import matplotlib.pyplot as plt
from pyspark.sql import Row

crime = SparkSession.builder.master("local").appName("CrimeDataAnalysis").getOrCreate()
crime.sparkContext.setCheckpointDir("~/scratch")

"""## Uploading 2022 Crime Data"""

Data22 = crime.read.csv("/storage/home/njq5013/Project/Crimes_-_2022_20231016.csv", header=True, inferSchema=True)

Data22.printSchema()

DF_2 = Data22.select("Primary Type", "Location Description")

DF_2.take(10)

"""## Cleaning 2022 Crime Data"""

cleandata22 = DF_2.filter(DF_2["Primary Type"].isNotNull())
cleandata22.show(10)

"""## Uploading 2023 Crime Data"""

Data23 = crime.read.csv("/storage/home/njq5013/Project/Crimes_-_2023_20231016.csv", inferSchema=True, header=True)

Data23.printSchema()

DF_3 = Data23.select("Primary Type", "Location Description")

DF_3.take(10)

"""## Cleaning 2023 Crime Data"""

cleandata23 = DF_3.filter(DF_3["Primary Type"].isNotNull())

cleandata23.show(10)

"""## Data Merging with 2022 - 2023"""

Data22_23 = Data22.union(Data23)

print(Data22.count())
print(Data23.count())
print(Data22.count() + Data23.count())
print(Data22_23.count())

Data22_23.printSchema()

"""## Selecting useful columns"""

df = Data22_23.select("Date", "Block", "Primary Type", "Description", "Location Description", "Arrest", "Domestic", "Beat", "District", "Ward", "Community Area", "Year")

"""## Removing rows with null values"""

df_clean = df.dropna(how='any')

print(df.count())
print(df_clean.count())

"""## INITIAL EDA AND MAPREDUCE"""

mapped_primary_type = df_clean.rdd.map(lambda row: (row["Primary Type"], 1))
reduced_primary_type = mapped_primary_type.reduceByKey(lambda a, b: a + b)
sorted_primary_type = reduced_primary_type.sortBy(lambda x: x[1], ascending=False)
primary_type_counts_sorted = sorted_primary_type.collect()

primary = []  # empty list for visualization
counts = []   # empty list for visualization

for primary_type, count in primary_type_counts_sorted:
    print(f"Primary Type: {primary_type} -> Count: {count}")
    primary.append(primary_type)  # the first column (primary type)
    counts.append(int(count))     # the second column (counts)

"""## Visualization - Primary type and count"""
# bar plot of primary_type and counts
plt.bar(primary, counts)  # from above
plt.xlabel('Primary Type')
plt.ylabel('Count')
plt.title('Primary Type Counts')
plt.xticks(rotation=90)  # readability is bad without this, unless we switch the axes
plt.show()

# line plot - same thing as above
plt.plot(primary, counts, marker="o")  # from above
plt.xlabel('Primary Type')
plt.ylabel('Count')
plt.title('Primary Type Counts')
plt.xticks(rotation=90)  # readability is bad without this, unless we switch the axes
plt.show()

# MapReduce to count occurrences based on multiple attributes

# Map function to extract the required attributes and create key-value pairs
def map_attributes(row):
    date_obj = datetime.strptime(row["Date"], "%m/%d/%Y %I:%M:%S %p")  # Adjust date format if necessary
    month = date_obj.month
    year = row["Year"]
    key = (row["Primary Type"], row["Location Description"], month, year, row["Block"], row["Ward"], row["District"])
    return (key, 1)

# Map phase
mapped_data = df_clean.rdd.map(map_attributes)

# Reduce phase: Sum the values for each key
reduced_data = mapped_data.reduceByKey(lambda a, b: a + b)

# Sort the results in descending order based on count
sorted_data = reduced_data.sortBy(lambda x: x[1], ascending=False)

# Collect the sorted results
aggregated_counts_sorted = sorted_data.collect()

# Displaying the first few sorted results for verification
for (primary_type, location_description, month, year, block, ward, district), count in aggregated_counts_sorted[:10]:
    print(f"Primary Type: {primary_type}, Location: {location_description}, Month: {month}, Year: {year}, Block: {block}, Ward: {ward}, District: {district} -> Count: {count}")

# MapReduce to count occurrences for each type of crime in each ward
mapped_rdd = df_clean.rdd.map(lambda row: ((row["Ward"], row["Primary Type"]), 1))
reduced_rdd = mapped_rdd.reduceByKey(lambda a, b: a + b)
ward_crime_counts = reduced_rdd.collect()

# Selecting a specific ward for visualization (e.g., Ward 10)
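# ---------------------------------------------------------------------------
# The uploaded file is cut off right after the comment above, so the ward-level
# plot it points to is not present in the source. What follows is a minimal,
# hedged sketch of how that Ward 10 visualization could look, reusing the
# collected ward_crime_counts list and the matplotlib conventions from the
# earlier cells. The variable names (target_ward, ward_counts, crime_types,
# type_counts) are illustrative assumptions, not the original authors' code.
target_ward = 10  # example ward named in the comment above

# Keep only the (crime type, count) pairs belonging to the target ward,
# sorted from most to least frequent.
ward_counts = [(crime_type, count)
               for (ward, crime_type), count in ward_crime_counts
               if ward == target_ward]
ward_counts.sort(key=lambda x: x[1], reverse=True)

crime_types = [t for t, _ in ward_counts]
type_counts = [c for _, c in ward_counts]

plt.bar(crime_types, type_counts)
plt.xlabel('Primary Type')
plt.ylabel('Count')
plt.title(f'Crime Counts in Ward {target_ward}')
plt.xticks(rotation=90)  # keep long crime-type labels readable
plt.show()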