Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
151 commits
Select commit Hold shift + click to select a range
e1597ae
Code to read csv file
GayathriSrividya Aug 23, 2022
ec397d6
code to update user rating
GayathriSrividya Aug 23, 2022
376edf4
required packages
GayathriSrividya Aug 23, 2022
2bfe168
-S
GayathriSrividya Aug 23, 2022
269bc60
install required packages
GayathriSrividya Aug 23, 2022
2dabc71
install packages
GayathriSrividya Aug 23, 2022
0b7ffb3
python class to update user ratig
GayathriSrividya Aug 23, 2022
e23d808
required dependencies
GayathriSrividya Aug 23, 2022
65d9473
python class to update rating
GayathriSrividya Aug 23, 2022
33b8beb
python code to read and update csv file
GayathriSrividya Aug 23, 2022
0970255
required dependencies
GayathriSrividya Aug 23, 2022
9fb4a4e
Delete ratings_class.py
RajavarapuGayathri Aug 23, 2022
b6f15b6
csv file
GayathriSrividya Aug 23, 2022
719d4b4
configuration file to connect postgreSQL database
Aug 29, 2022
977bf45
python code to convert csv file to postgreSQL database
Aug 29, 2022
a119859
gitignore commit
Aug 29, 2022
456582a
python code to convert csv file to postgreSQL database
Aug 29, 2022
2dc8db4
configurations for postgreSQL server
Aug 29, 2022
fe17920
gitignore commit
Aug 29, 2022
4930c0f
configurations for postgreSQL server
Sep 1, 2022
6b3083e
python code to convert csv to postgresql table
Sep 1, 2022
396ea23
python class to handle CRUD functions
Sep 1, 2022
a89d470
gitignore commit
Sep 1, 2022
21c8bb3
gitignore commit
Sep 1, 2022
ecad137
Merge branch 'new-user-rating' of github.com:GayathriSrividya/trainin…
Sep 1, 2022
66883d7
configuration file for postgreSQL
Sep 1, 2022
d924ff2
gitignore commit
Sep 1, 2022
17f75fc
python code to convert csv file to postgresql database
Sep 1, 2022
5a3f74f
configuration file for postgreSQL
Sep 1, 2022
b605091
python code to convert dcsv file to postgresql table
Sep 1, 2022
efe2951
python code to convert csv file to postgresql table
Sep 1, 2022
c6a0bd8
python class to manage records of postgreSQL table
Sep 1, 2022
02ebc90
configuration file forpostgresql
Sep 1, 2022
a231559
initial commit
Sep 1, 2022
9fb68c3
configuration file for postgresql
Sep 1, 2022
e1ed18d
python code to convert csv file to pandas dataframe
Sep 1, 2022
2f17c85
python class to update dataframe attributes
Sep 1, 2022
c07c936
python class to manage records of postgreSQL table
Sep 1, 2022
2216b3e
required dependencies
Sep 1, 2022
df8b36b
add sql queries
Sep 2, 2022
e3af4a1
python class to manage records of postgreSQL table
Sep 2, 2022
1009360
python code to convert csv file to postgresql table
Sep 2, 2022
f6ab699
python class to manage records of postgreSQL table
Sep 2, 2022
231e2d5
python code to convert csv file to postgresql table
Sep 2, 2022
0ea2a9e
python class to manage records of postgreSQL table
Sep 2, 2022
96ed84f
python class to manage records of postgreSQL table
Sep 2, 2022
c3c3d44
initial commit
Sep 5, 2022
b078c44
initial commit
Sep 5, 2022
4e64ef2
python cod eto convert csv file to pandas dataframe
Sep 5, 2022
41f0224
python code to convert csv file to pandas dataframe
Sep 5, 2022
d49134b
python class to update dataframe attributes
Sep 5, 2022
55199cf
python code to convert csv file to postgreSQL table
Sep 5, 2022
d0dbed5
python class to manage records in postgreSQL table
Sep 5, 2022
57a5422
initial commit
Sep 5, 2022
d3dc6a3
python code
Sep 5, 2022
e15226e
initial commit
Sep 5, 2022
757759b
initial commit
Sep 5, 2022
2658591
configuration file for postgreSQL
Sep 5, 2022
5092fcc
python code to convert csv file to pandas dataframe
Sep 5, 2022
62dc983
pythoon code to convert csv file to postgreSQL table
Sep 5, 2022
cca64ce
python class to manage records of postgreSQL table
Sep 5, 2022
abd50d1
python class to manage records of postgreSQL table
Sep 5, 2022
a79dd95
README commit
Sep 5, 2022
e99dad8
README commit
Sep 5, 2022
3867a44
Update README.md
GayathriSrividya Sep 5, 2022
36d3209
Update README.md
GayathriSrividya Sep 5, 2022
a02acb6
Update README.md
GayathriSrividya Sep 5, 2022
9fa0b9d
Update README.md
GayathriSrividya Sep 5, 2022
aba7d62
Update README.md
GayathriSrividya Sep 5, 2022
a6d2620
configuration file for postgreSQL
Sep 5, 2022
00326a9
required dependencies
Sep 6, 2022
79d4f00
initial commit
Sep 6, 2022
84790a8
python code to convert pandas dataframe to postgreSQL database
Sep 6, 2022
fb907c0
python class to manage records of postgreSQL table
Sep 6, 2022
4a52f0f
Delete db_query.py
GayathriSrividya Sep 6, 2022
d47bcdb
Delete df_to_db.py
GayathriSrividya Sep 6, 2022
a35b5ea
python class to manage records of postgreSQL table
Sep 6, 2022
de0c8ff
unit test cases
Sep 7, 2022
04ffe6d
unittest cases
Sep 7, 2022
5b6dc9f
initial commit
Sep 7, 2022
94b653b
configuration file for postgreSQL
Sep 7, 2022
2fe7004
python code to convert csv file to pandas dataframe
Sep 7, 2022
e192c73
python class to manage records of postgreSQL table
Sep 7, 2022
4c7cb69
python code to convert pandas dataframe to postgreSQL database
Sep 7, 2022
da8ee8f
python class to manage records of postgreSQL table
Sep 7, 2022
90c8c42
python code to convert csv file to pandas dataframe
Sep 7, 2022
fd11897
python class to manage records of postgreSQL table
Sep 7, 2022
b735bb7
required dependencies
Sep 7, 2022
cf5860a
python class to update dataframe attributes
Sep 7, 2022
b4d4c3d
initial commit
Sep 7, 2022
e212455
required dependencies
Sep 7, 2022
4342813
unittest cases
Sep 7, 2022
7d169d4
project setup
Sep 9, 2022
9e246a4
required dependencies
Sep 9, 2022
cedc479
csv to postgresql using pandas
Sep 9, 2022
a2b739e
updated .gitignore
Sep 9, 2022
6774beb
configuration file for postgreSQL
Sep 9, 2022
7eacd81
params for testv1.py file
Sep 9, 2022
d3d9a3f
params for testv2.py file
Sep 9, 2022
2f33c5e
uniitest cases
Sep 9, 2022
025ace5
uniitest cases
Sep 9, 2022
762908a
json file with sql queries
Sep 9, 2022
f31796a
python code to convert csv file to pandas dataframe
Sep 9, 2022
dd90615
python class to update dataframe attributes
Sep 9, 2022
6cc39c6
python class to manage records of postgreSQL table
Sep 9, 2022
3b2a826
python code to convert pandas dataframe to postgreSQL database
Sep 9, 2022
68cdfb1
Delete test_query.json
GayathriSrividya Sep 9, 2022
5dd5340
Delete v1params.json
GayathriSrividya Sep 9, 2022
69142d4
Delete v2params.json
GayathriSrividya Sep 9, 2022
c75dc74
Delete update_ratings.py
GayathriSrividya Sep 9, 2022
da136cd
Delete ratings_class.py
GayathriSrividya Sep 9, 2022
06e149d
Delete dataframe.py
GayathriSrividya Sep 9, 2022
2c71eb2
Delete records_class.py
GayathriSrividya Sep 9, 2022
c99cc0c
python code to convert pandas dataframe to postgreSQL database
Sep 9, 2022
4deb04d
json file with SQL queries
Sep 9, 2022
8542d3c
python code to convert pandas dataframe to postgreSQL database
Sep 9, 2022
2f3c9ba
python class to update dataframe attributes
Sep 9, 2022
e84742c
python code to convert pandas dataframe to postgreSQL database
Sep 9, 2022
08c2890
project setup
Sep 14, 2022
47fba28
json file with SQL queries
Sep 14, 2022
1e7c08e
json file with pandas queries
Sep 14, 2022
908642f
python code to convert csv file to pandas dataframe
Sep 14, 2022
694dc46
python code to convert cav file to postgreSQL database
Sep 14, 2022
f83d689
python class to manage records of postgreSQL table
Sep 14, 2022
6536207
python code to convert and manipulate csv file into pandas dataframe
Sep 14, 2022
9bb21ec
python class to perform CRUD operations on pandas dataframe
Sep 14, 2022
87c695b
params for unittest cases
Sep 14, 2022
9f8fcda
unittest cases
Sep 14, 2022
03f28fb
python class to manage records in postgreSQL table
Sep 14, 2022
1c4bfd6
python class to manage records in postgreSQL table
Sep 14, 2022
e35e13f
updated gitignore
Sep 14, 2022
3775f2b
updated gitignore
Sep 14, 2022
27e8c80
updated gitignore
Sep 14, 2022
d9fad1f
updated gitignore
Sep 14, 2022
6613b86
updated gitignore
Sep 14, 2022
b72183e
updated gitignore
Sep 14, 2022
abe3c47
updated gitignore
Sep 14, 2022
20f902d
updated gitignore
Sep 14, 2022
fd66e54
updated gitignore
Sep 14, 2022
cd75670
removing coverage file
Sep 14, 2022
29ef2ec
python code to convert and manipulate csv file into pandas dataframe
Sep 14, 2022
930401e
unittest cases
Sep 14, 2022
e1ef904
unittest cases
Sep 14, 2022
2947565
params for unittest cases
Sep 14, 2022
61aeee8
python code to convert and manipulate csv file into pandas dataframe
Sep 14, 2022
5bcc493
querying on data with pyspark
Sep 22, 2022
49bdb9f
pyspark queries
Sep 22, 2022
bbd1512
updated gitignore
Sep 22, 2022
3a88166
querying on data with pyspark
Sep 22, 2022
c403122
pyspark queries
Sep 22, 2022
6994d7d
required dependencies
Sep 22, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.coverage
database.ini
my-project-env
__pycache__
__init__.py
htmlcov/
Dockerfile
133 changes: 133 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,136 @@ Const,Your Rating,Date Rated,Title,URL,Title Type,IMDb Rating,Runtime (mins),Yea
* Every commit should be GPG signed
* Create a Pull Request with updated code

###########################################

this repo contains following folders

data:
----
data consists of ratings.csv file

config:
------
read_queries.json // json file that consists of SQL queries
panda_queries.json // json file that consists of pandas commands

utils:
-----

dbconfig.py // python file used for configuration of postgreSQL server

src:
----
src folder consists of python programs that retrieve and manipulate values in csv file using
pandas and postgreSQL

src->v1
-------
1)csv2pandas.py ---- this file has set of instructions that converts csv file to pandas dataframe, creates instance for the class Ratings to retrieve and update user rating

2)ratings.py ----- this python class stores the dataframe values as attributes and provides the methods get_rating() and set_rating() to read and update the user rating; set_rating() also sets the date rated to the current date

src->v2
------
1)csv2db.py ---- converts csv file to postgreSQL using pandas, creating instance for the class Ratings to manipulate columns in the
postgreSQL table

2)ratings.py ---- consists of python class Ratings, involves different methods
to create a table and also perform CRUD(create, read, update, delete) operations on the table

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add steps on how to set up the project and how to execute each script.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is commit message saying initial commit?


src->v3
------
1)pandasql.py ---- converts csv file to a pandas Dataframe, creating instance for the class Ratings to manipulate columns in the Dataframe

2)ratings.py ---- consists of python class Ratings, involves different methods
to create a table and also perform CRUD(create, read, update, delete) operations on the Dataframe

tests:
------

this folder contains unittest files to check functionality of python codes existing in src folder, and also contains json files containing test cases.

tests->config
-------------

this folder consists of json files which have parameters for the test cases

setting up github repository:
----------------------------

Before you start working on the project, create your own github repository and generate SSH, GPG keys for authentication.
for more information, refer below:


https://docs.github.com/en/get-started/quickstart/create-a-repo

https://docs.github.com/en/authentication/connecting-to-github-with-ssh

https://docs.github.com/en/authentication/managing-commit-signature-verification


create python virtual environment in your linux system
-----------------------------------------------------

run python -V (if version is not displayed run sudo apt install python3)

after installing python, run "python -m venv my-project-env"

then virtual environment named my-project-env will be created.

run "source my-project-env/bin/activate" to activate

run "pip install requests" & "python -c "import requests"" only for the first time.

to close the virtual environment, type "deactivate"


install postgres in linux:
--------------------------

type the following command

"sudo apt-get install postgresql"

after successful installation, connect to postgresql using
"sudo -i -u postgres"

the user will now be switched to postgres; type "psql" to connect to the database server, or to return to the regular user type "exit" or press ctrl+d

postgres@user:~$ psql

postgres=# // here you can create different databases and manage tables

to create a database
postgres=# "create database my_database;"

connect to database "my_database"
postgres=# \c my_database

to exit type \q or ctrl+d


setup:
-----
install required dependencies in requirements.txt file

"pip install -r requirements.txt"

create a new directory using "mkdir dir_name"

to navigate into directory use "cd path/to/dir_name"

create a new file (say python file) use "touch file.py"

to execute a python script, use command "python file.py" or "python path/to/file.py"

Generate test coverage:
-----------------------

type the following commands in the terminal to generate test coverage report

"coverage run -m unittest discover"

"coverage report"

"coverage html"
11 changes: 11 additions & 0 deletions config/panda_queries.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"List number of rows in a dataframe": "print(self.ratings.shape[0])",
"List number of rows by title type": "print(self.ratings['Title Type'].value_counts())",
"List number of rows by year of rating": "print(pd.DatetimeIndex(self.ratings['Date Rated']).year.value_counts())",
"List number of rows by year of release": "print(self.ratings['Year'].value_counts())",
"List top 10 rated titles": "print(self.ratings[['Title', 'Your Rating']].sort_values(by='Your Rating', ascending=False)[:10])",
"List bottom 10 rated titles": "print(self.ratings[['Title', 'Your Rating']].sort_values(by='Your Rating')[:10])",
"Fetch histogram of ratings": "print(self.ratings['Your Rating'].value_counts())",
"List genres by their average ratings, sorted decrementally: ": "self.ratings['Genres']=self.ratings['Genres'].str.split(', ')\nprint(self.ratings.explode('Genres').groupby('Genres')[['Genres', 'Your Rating']].mean().round(decimals=2).sort_values(by='Your Rating', ascending=False))",
"Filter rows using const id": "print(self.ratings.loc[self.ratings.Const==id])"
}
11 changes: 11 additions & 0 deletions config/pyspark_queries.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"List number of rows in a dataframe": "print(ratings.count())",
"List number of rows by title type": "ratings.groupBy(ratings['Title Type']).count().sort(desc('count')).show()",
"List number of rows by year of rating": "ratings.withColumn('Year Rated', year(to_date(col('Date Rated'), 'MM/dd/yyyy'))).groupBy('Year Rated').count().sort(desc('count')).show()",
"List number of rows by year of release": "ratings.groupBy(ratings.Year).count().sort(asc('Year')).show()",
"List top 10 rated titles": "ratings.select([ratings['Title'], ratings['Your Rating']]).sort(desc('Your Rating')).show(10)",
"List bottom 10 rated titles": "ratings.select([ratings['Title'], ratings['Your Rating']]).sort(asc('Your Rating')).show(10)",
"Fetch histogram of ratings": "ratings.groupBy(ratings['Your rating']).count().sort(asc('Your Rating')).show()",
"List genres by their average ratings, sorted decrementally: ": "ratings.withColumn('Genres', explode_outer(split('Genres', ', ')).alias('Genres')).groupBy('Genres').agg(round(mean('Your Rating'), 2).alias('Average Rating')).sort(desc('Average Rating')).show(26)",
"Filter rows using const id": "ratings.where(ratings.Const==input('Enter Const id: ')).show()"
}
12 changes: 12 additions & 0 deletions config/read_queries.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"List total number of records in table: ": "SELECT COUNT(*) FROM ratings;",
"List number of records by title type: ": "SELECT title_type, COUNT(*) FROM ratings GROUP BY title_type; " ,
"List number of records by year of rating: ": "SELECT EXTRACT(YEAR FROM date_rated) AS year_of_rating, COUNT(*) FROM ratings GROUP BY 1 ORDER BY 1;",
"List number of records by year of release: ": "SELECT year, COUNT(*) FROM ratings GROUP BY 1 ORDER BY 1;",
"List the top 10 rated titles: ": "SELECT title, your_rating FROM ratings ORDER BY your_rating DESC LIMIT 10;",
"List the bottom 10 rated titles: ": "SELECT title, your_rating FROM ratings ORDER BY your_rating LIMIT 10;",
"List genres by their average ratings, sorted decrementally: ": "SELECT genre, CAST(AVG(your_rating) AS DECIMAL(5,2)) AS average_rating FROM (SELECT DISTINCT(UNNEST(STRING_TO_ARRAY(genres, ', '))) AS genre, your_rating FROM ratings) AS temp GROUP BY genre ORDER BY average_rating DESC;",
"Fetch histogram of ratings: ": "SELECT your_rating, COUNT(*) FROM ratings GROUP BY 1 ORDER BY 1;",
"Filter Records from table using Const": "SELECT * FROM ratings WHERE Const=%s"

}
15 changes: 15 additions & 0 deletions pyspark_sql/csv2pyspark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Script: load ratings.csv into a Spark DataFrame and run every query snippet
# stored in config/pyspark_queries.json against it.

spark = SparkSession.builder.appName("pyspark_sql").getOrCreate()
# NOTE(review): presumably required so the 'MM/dd/yyyy' date pattern used by
# the "year of rating" query parses as it did before Spark 3 -- confirm.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
ratings = spark.read.csv('../data/ratings.csv', header=True, inferSchema=True)

# Read the {description: python snippet} mapping. The context manager closes
# the file handle as soon as the JSON has been parsed (it was leaked before).
with open('../config/pyspark_queries.json') as pyspark_queries:
    query_data = json.load(pyspark_queries)

# executing pyspark queries one by one
# NOTE(review): exec() runs whatever code the JSON file contains with this
# module's globals (the snippets rely on `ratings` and on the names pulled in
# by the star import of pyspark.sql.functions). Keep the JSON file trusted;
# never point this script at user-supplied query files.
for query, pdtxt in query_data.items():
    print(query + ':\n')
    exec(pdtxt)
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pandas==1.4.4
datetime==4.5
psycopg2==2.9.3
unittest2==1.1.0
coverage==6.4.4
postgres==4.0
pyspark==3.3.0
py4j==0.10.9.5
17 changes: 17 additions & 0 deletions src/v1/csv2pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# import necessary packages
import pandas as pd
from ratings import Ratings
# Load the ratings csv file into a pandas DataFrame
csv_file = pd.read_csv('../../data/ratings.csv', encoding='latin')

# csv columns passed to Ratings, listed in the order of its constructor arguments
rating_columns = (
    'Const', 'Your Rating', 'Date Rated', 'Title', 'URL', 'Title Type',
    'IMDb Rating', 'Runtime (mins)', 'Year', 'Genres', 'Num Votes',
    'Release Date', 'Directors',
)

# Walk over the first 10 rows and ask the user for a new rating of each title
for index, row in csv_file.head(10).iterrows():
    # one Ratings instance per row, filled from the row's column values
    this_movie = Ratings(*[row[column] for column in rating_columns])
    prompt = "Enter your rating for " + row['Title'] + " : "
    new_rating = input(prompt)
    this_movie.set_rating(new_rating)
119 changes: 119 additions & 0 deletions src/v1/ratings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# importing necessary modules
from datetime import datetime

class Ratings:
    """Hold the values of one ratings row and update its user rating.

    Attributes:
    -----------
    const          unique id that represents the movie
    your_rating    user rating for the movie
    date_rated     date on which the user rating was last updated
    title          movie title
    url            link to access the resource
    title_type     type of title (eg: movie, short)
    imdb_rating    average rating for the movie
    runtime        total duration
    year           year released
    genres         category of film
    num_votes      number of votes given
    release_date   release date
    directors      list of directors

    Methods:
    --------
    get_rating()
        returns the current your_rating value

    set_rating(new_rating)
        stores new_rating in your_rating and sets date_rated to the current
        date, provided new_rating is a number between 0 and 10 (inclusive)
    """

    def __init__(self, const, your_rating, date_rated, title, url, title_type,
                 imdb_rating, runtime, year, genres, num_votes, release_date,
                 directors):
        """Store every argument unchanged on the attribute of the same name.

        Parameters:
        -----------
        const, your_rating, date_rated, title, url, title_type, imdb_rating,
        runtime, year, genres, num_votes, release_date, directors
            see the attribute list in the class docstring
        """
        self.const = const
        self.your_rating = your_rating
        self.date_rated = date_rated
        self.title = title
        self.url = url
        self.title_type = title_type
        self.imdb_rating = imdb_rating
        self.runtime = runtime
        self.year = year
        self.genres = genres
        self.num_votes = num_votes
        self.release_date = release_date
        self.directors = directors

    def get_rating(self):
        """Return the current user rating (your_rating)."""
        return self.your_rating

    def set_rating(self, new_rating):
        """Update the user rating and stamp the current date.

        Parameters:
        -----------
        new_rating
            any value accepted by float(); must lie between 0 and 10.0
            (both inclusive) after conversion

        Returns:
        --------
        None when the rating was updated, -1 when new_rating could not be
        converted to a number or is out of range. In the failure case a
        message is printed and neither your_rating nor date_rated changes.

        On success your_rating becomes float(new_rating) and date_rated
        becomes today's date formatted as "%d/%m/%Y".
        """
        invalid_message = ("\ninvalid input!! must be in between 0 and 10!! "
                           "\nrating not updated\n")

        # Only the conversion can fail on bad input, so only it is guarded.
        # The exceptions are named explicitly so that KeyboardInterrupt and
        # SystemExit are no longer swallowed as "invalid input".
        try:
            rating = float(new_rating)
        except (TypeError, ValueError, OverflowError):
            print(invalid_message)
            return -1

        # NaN fails this comparison as well and is therefore rejected.
        if not 0 <= rating <= 10.0:
            print(invalid_message)
            return -1

        self.your_rating = rating
        self.date_rated = datetime.now().strftime("%d/%m/%Y")
        print("rating updated sucessfully for title {0}\n".format(self.title))

44 changes: 44 additions & 0 deletions src/v2/csv2db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# importing required modules
import json

import pandas as pd

from ratings import Ratings

# Script: load ratings.csv into a pandas DataFrame, insert every row into the
# database through the Ratings class, run the stored read queries, then
# update and delete one sample record.

# reading csv file

print("reading file ratings.csv...\n")
print("converting csv file into pandas dataframe...\n")
df = pd.read_csv("../../data/ratings.csv", encoding='latin')
# parse the rating dates so they are inserted as datetime values, not strings
df['Date Rated'] = pd.to_datetime(df['Date Rated'])
print("csv file is successfully converted into dataframe...\n")

# creating instance for Ratings class

this_title = Ratings()

print("inserting values into database...\n")

# the row index is not needed, only the row values are inserted
for _, row in df.iterrows():
    this_title.insert(row)

print("Values are inserted into the database Successfully. \n")


# reading json file contains sql queries
# The context manager closes the file handle as soon as the JSON has been
# parsed (it was left open before).

with open('../../config/read_queries.json') as read_queries:
    query_data = json.load(read_queries)

# executing sql queries one by one

for query, sqltext in query_data.items():
    print(query + "\n")
    if "%s" in sqltext:
        # queries containing a %s placeholder take the Const id as parameter
        val = input("Enter Const id of title: ")
        this_title.read(sqltext, val)
    else:
        this_title.read(sqltext)

this_title.update("tt1001526", 7.23)
this_title.delete("tt1001526")
Loading