GayathriSrividya · GayathriSrividya · Aug 23, 2022 · Aug 23, 2022 · Aug 23, 2022 · Aug 23, 2022
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+.coverage
+database.ini
+my-project-env
+__pycache__
+__init__.py 
+htmlcov/
+Dockerfile
diff --git a/README.md b/README.md
@@ -28,3 +28,136 @@ Const,Your Rating,Date Rated,Title,URL,Title Type,IMDb Rating,Runtime (mins),Yea
 * Every commit should be GPG signed
 * Create a Pull Request with updated code
 
+###########################################
+
+This repo contains the following folders:
+
+data:
+----
+  the data folder contains the ratings.csv file
+
+config:
+------
+  read_queries.json // json file that contains SQL queries
+  panda_queries.json // json file that contains pandas commands
+
+utils:
+-----
+
+  dbconfig.py     // python file used for configuration of postgreSQL server
+
+src:
+----
+The src folder consists of python programs that retrieve and manipulate the values in the csv file using
+pandas and postgreSQL.
+
+src->v1
+-------
+1)csv2pandas.py ---- this file has a set of instructions that converts the csv file to a pandas dataframe and creates instances of the class Ratings to retrieve and update the user rating
+
+2)ratings.py ----- this python class holds the dataframe values as attributes and also provides the methods get_rating() and set_rating() to read the user rating and to update it along with the current date
+
+src->v2
+------
+1)csv2db.py  ---- converts csv file to postgreSQL using pandas, creating instance for the class Ratings to manipulate columns in the 
+		  postgreSQL table
+
+2)ratings.py ---- consists of python class Ratings, involves different methods 
+		  to create a table and also perform CRUD(create, read, update, delete) operations on the table 
+
+src->v3
+------
+1)pandasql.py  ---- converts the csv file to a pandas DataFrame, creating an instance of the class Ratings to manipulate columns in the DataFrame
+
+2)ratings.py ---- consists of python class Ratings, involves different methods 
+		  to create a table and also perform CRUD(create, read, update, delete) operations on the Dataframe
+
+tests:
+------
+
+This folder contains unittest files that check the functionality of the python code in the src folder, and also json files containing test cases.
+
+tests->config
+-------------
+
+this folder consists of json files which have parameters for the test cases
+
+setting up github repository:
+----------------------------
+
+Before you start working on the project, create your own github repository and generate SSH, GPG keys for authentication.
+for more information, refer below:
+
+
+https://docs.github.com/en/get-started/quickstart/create-a-repo
+
+https://docs.github.com/en/authentication/connecting-to-github-with-ssh
+
+https://docs.github.com/en/authentication/managing-commit-signature-verification
+
+
+create python virtual environment in your linux system
+----------------------------------------------------- 
+
+run python -V (if version is not displayed run sudo apt install python3)
+
+after installing python, run "python -m venv my-project-env" 
+
+then virtual environment named my-project-env will be created. 
+
+run "source my-project-env/bin/activate" to activate 
+
+run "pip install requests" and "python -c 'import requests'" only for the first time.
+
+to close the virtual environment, type "deactivate"
+
+
+install postgres in linux:
+--------------------------
+
+type the following command 
+
+"sudo apt-get install postgresql"
+
+after successful installation, connect to postgresql using 
+"sudo -i -u postgres"
+
+the user will now be switched to postgres; type "psql" to connect to the database server, or to return to the regular user type "exit" or press ctrl+d
+
+postgres@user:~$ psql
+
+postgres=# // here you can create different databases and manage tables 
+
+to create a database
+postgres=# create database my_database;
+
+connect to database "my_database"
+postgres=# \c my_database
+
+to exit type \q or ctrl+d
+
+
+setup:
+-----
+install required dependencies in requirements.txt file
+
+"pip install -r requirements.txt"
+
+create a new directory using "mkdir dir_name"
+
+to navigate into directory use "cd path/to/dir_name"
+
+create a new file (say python file) use "touch file.py"
+
+to execute a python script, use command "python file.py" or "python path/to/file.py"
+
+Generate test coverage:
+-----------------------
+
+type the following commands in the terminal to generate test coverage report
+
+"coverage run -m unittest discover"
+
+"coverage report"
+
+"coverage html"
diff --git a/config/panda_queries.json b/config/panda_queries.json
@@ -0,0 +1,11 @@
+{
+    "List number of rows in a dataframe": "print(self.ratings.shape[0])",
+    "List number of rows by title type": "print(self.ratings['Title Type'].value_counts())",
+    "List number of rows by year of rating": "print(pd.DatetimeIndex(self.ratings['Date Rated']).year.value_counts())",
+    "List number of rows by year of release": "print(self.ratings['Year'].value_counts())",
+    "List top 10 rated titles": "print(self.ratings[['Title', 'Your Rating']].sort_values(by='Your Rating', ascending=False)[:10])",
+    "List bottom 10 rated titles": "print(self.ratings[['Title', 'Your Rating']].sort_values(by='Your Rating')[:10])",
+    "Fetch histogram of ratings": "print(self.ratings['Your Rating'].value_counts())",
+    "List genres by their average ratings, sorted decrementally: ": "self.ratings['Genres']=self.ratings['Genres'].str.split(', ')\nprint(self.ratings.explode('Genres').groupby('Genres')[['Genres', 'Your Rating']].mean().round(decimals=2).sort_values(by='Your Rating', ascending=False))",
+    "Filter rows using const id": "print(self.ratings.loc[self.ratings.Const==id])"
+}
diff --git a/config/pyspark_queries.json b/config/pyspark_queries.json
@@ -0,0 +1,11 @@
+{
+    "List number of rows in a dataframe": "print(ratings.count())",
+    "List number of rows by title type": "ratings.groupBy(ratings['Title Type']).count().sort(desc('count')).show()",
+    "List number of rows by year of rating": "ratings.withColumn('Year Rated', year(to_date(col('Date Rated'), 'MM/dd/yyyy'))).groupBy('Year Rated').count().sort(desc('count')).show()",
+    "List number of rows by year of release": "ratings.groupBy(ratings.Year).count().sort(asc('Year')).show()",
+    "List top 10 rated titles": "ratings.select([ratings['Title'], ratings['Your Rating']]).sort(desc('Your Rating')).show(10)",
+    "List bottom 10 rated titles": "ratings.select([ratings['Title'], ratings['Your Rating']]).sort(asc('Your Rating')).show(10)",
+    "Fetch histogram of ratings": "ratings.groupBy(ratings['Your rating']).count().sort(asc('Your Rating')).show()",
+    "List genres by their average ratings, sorted decrementally: ": "ratings.withColumn('Genres', explode_outer(split('Genres', ', ')).alias('Genres')).groupBy('Genres').agg(round(mean('Your Rating'), 2).alias('Average Rating')).sort(desc('Average Rating')).show(26)",
+    "Filter rows using const id": "ratings.where(ratings.Const==input('Enter Const id: ')).show()"
+}
diff --git a/config/read_queries.json b/config/read_queries.json
@@ -0,0 +1,12 @@
+{
+    "List total number of records in table: ": "SELECT COUNT(*) FROM ratings;",
+    "List number of records by title type: ": "SELECT title_type, COUNT(*) FROM ratings GROUP BY title_type; " ,
+    "List number of records by year of rating: ": "SELECT EXTRACT(YEAR FROM date_rated) AS year_of_rating, COUNT(*) FROM ratings GROUP BY 1 ORDER BY 1;",
+    "List number of records by year of release: ": "SELECT year, COUNT(*) FROM ratings GROUP BY 1 ORDER BY 1;",
+    "List the top 10 rated titles: ": "SELECT title, your_rating FROM ratings ORDER BY your_rating DESC LIMIT 10;",
+    "List the bottom 10 rated titles: ": "SELECT title, your_rating FROM ratings ORDER BY your_rating LIMIT 10;",
+    "List genres by their average ratings, sorted decrementally: ": "SELECT genre, CAST(AVG(your_rating) AS DECIMAL(5,2)) AS average_rating FROM (SELECT DISTINCT(UNNEST(STRING_TO_ARRAY(genres, ', '))) AS genre, your_rating FROM ratings) AS temp GROUP BY genre ORDER BY average_rating DESC;",
+    "Fetch histogram of ratings: ": "SELECT your_rating, COUNT(*) FROM ratings GROUP BY 1 ORDER BY 1;",
+    "Filter Records from table using Const": "SELECT * FROM ratings WHERE Const=%s"
+
+}
diff --git a/pyspark_sql/csv2pyspark.py b/pyspark_sql/csv2pyspark.py
@@ -0,0 +1,15 @@
+import json
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import *
+
+# start (or reuse) the Spark session used for every query below
+spark=SparkSession.builder.appName("pyspark_sql").getOrCreate()
+# NOTE(review): presumably needed for the to_date(..., 'MM/dd/yyyy') call in the query file - confirm
+spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
+ratings=spark.read.csv('../data/ratings.csv', header=True, inferSchema=True)
+
+# load the query texts; the with-block closes the file as soon as it is parsed
+with open('../config/pyspark_queries.json') as pyspark_queries:
+    query_data=json.load(pyspark_queries)
+
+# executing pyspark queries one by one
+# NOTE(review): exec() runs whatever text the json file holds, so the config
+# must be trusted; the snippets refer to `ratings` and to the names
+# star-imported from pyspark.sql.functions
+for query, pdtxt in query_data.items():
+    print(query+':\n')
+    exec(pdtxt)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,8 @@
+pandas==1.4.4
+datetime==4.5
+psycopg2==2.9.3
+unittest2==1.1.0
+coverage==6.4.4
+postgres==4.0
+pyspark==3.3.0
+py4j==0.10.9.5
diff --git a/src/v1/csv2pandas.py b/src/v1/csv2pandas.py
@@ -0,0 +1,17 @@
+# import necessary packages
+import pandas as pd
+from ratings import Ratings
+# Load the ratings csv into a pandas DataFrame
+csv_file=pd.read_csv('../../data/ratings.csv', encoding='latin')
+
+# csv columns, listed in the order the Ratings constructor takes them
+columns=['Const', 'Your Rating', 'Date Rated', 'Title', 'URL', 'Title Type', 'IMDb Rating', 'Runtime (mins)', 'Year', 'Genres', 'Num Votes', 'Release Date', 'Directors']
+
+# walk over the first ten rows and ask the user for a new rating of each title
+for _, row in csv_file.head(10).iterrows():
+    # wrap the row values in a Ratings instance
+    this_movie=Ratings(*[row[name] for name in columns])
+    prompt="Enter your rating for "+row['Title']+" : "
+    this_movie.set_rating(input(prompt))
diff --git a/src/v1/ratings.py b/src/v1/ratings.py
@@ -0,0 +1,119 @@
+# importing necessary modules
+from datetime import datetime
+
+class Ratings:
+    '''
+    Description of class Ratings
+
+    This class holds the column values of one title from the ratings data and
+    lets the user rating and the date rated be updated on the instance
+
+    attributes:
+    -----------
+    const       //contains unique id that represents movie
+    your_rating //user rating for the movie
+    date_rated  //date in which user rating is last updated
+    title       //movie title
+    url         //url contains link to access the resource
+    title_type  //type of movie (eg: movie, short)
+    imdb_rating //average rating for the movie
+    runtime     //total duration
+    year        //year released
+    genres      //category of film
+    num_votes   //number of votes given
+    release_date
+    directors   //list of directors
+
+    methods defined here:
+    ---------------------
+    get_rating(self)
+        returns your_rating value
+
+    set_rating(self, new_rating)
+        parameters:
+         new_rating
+
+        updates value in your_rating to new_rating
+        updates date_rated to current date
+
+        the update is applied only when new_rating lies in the range 0 to 10,
+        otherwise a message is printed and -1 is returned
+
+    '''
+    def __init__(self, const, your_rating, date_rated, title, url, title_type, imdb_rating, runtime, year, genres, num_votes, release_date, directors):
+        '''
+        Default Constructor for Ratings class
+
+        stores every argument unchanged in the attribute of the same name
+
+        parameters:
+        -----------
+        const
+        your_rating
+        date_rated
+        title
+        url
+        title_type
+        imdb_rating
+        runtime
+        year
+        genres
+        num_votes
+        release_date
+        directors
+        '''
+        self.const=const
+        self.your_rating=your_rating
+        self.date_rated=date_rated
+        self.title=title
+        self.url=url
+        self.title_type=title_type
+        self.imdb_rating=imdb_rating
+        self.runtime=runtime
+        self.year=year
+        self.genres=genres
+        self.num_votes=num_votes
+        self.release_date=release_date
+        self.directors=directors
+
+    def get_rating(self):
+        '''
+        this method is used to retrieve user rating
+
+        returns:
+        --------
+        the current value of your_rating
+        '''
+        return self.your_rating
+
+    def set_rating(self, new_rating):
+        '''
+        this method is used to update user rating with the input value and
+        also updates the date rated to the current date (format dd/mm/YYYY)
+
+        parameters:
+        ----------
+        new_rating //input, any value that float() can convert
+
+        returns:
+        --------
+        None when the rating is updated
+        -1 when the input cannot be converted to a number or lies outside
+        the range 0 to 10.0; the instance is left unchanged in that case
+        '''
+        invalid_msg="\ninvalid input!! must be in between 0 and 10!! \nrating not updated\n"
+
+        # only the conversion is guarded, so unrelated errors are not
+        # reported to the user as "invalid input"
+        try:
+            new_rating=float(new_rating)
+        except (TypeError, ValueError, OverflowError):
+            print(invalid_msg)
+            return -1
+
+        # nan fails this comparison as well and is rejected
+        if not 0 <= new_rating <= 10.0:
+            print(invalid_msg)
+            return -1
+
+        self.your_rating=new_rating
+        self.date_rated=datetime.now().strftime("%d/%m/%Y")
+        print("rating updated sucessfully for title {0}\n".format(self.title))
+
diff --git a/src/v2/csv2db.py b/src/v2/csv2db.py
@@ -0,0 +1,44 @@
+# importing required modules
+import json
+
+import pandas as pd
+
+from ratings import Ratings
+
+# reading csv file
+
+print("reading file ratings.csv...\n")
+print("converting csv file into pandas dataframe...\n")
+df=pd.read_csv("../../data/ratings.csv", encoding='latin')
+# convert the 'Date Rated' strings into pandas datetime values
+df['Date Rated'] = pd.to_datetime(df['Date Rated'])
+print("csv file is successfully converted into dataframe...\n")
+
+# creating instance for Ratings class
+
+this_title=Ratings()
+
+print("inserting values into database...\n")
+
+# hand the dataframe to the Ratings instance one row at a time
+for index, row in df.iterrows():
+    this_title.insert(row)
+
+print("Values are inserted into the database Successfully. \n")
+
+
+# reading json file contains sql queries
+# the with-block closes the file as soon as it has been parsed
+
+with open('../../config/read_queries.json') as read_queries:
+    query_data = json.load(read_queries)
+
+# executing sql queries one by one
+
+for query, sqltext in query_data.items():
+    print(query+"\n")
+    # a query with a %s placeholder needs a Const id typed in by the user
+    if "%s" in sqltext:
+        val = input("Enter Const id of title: ")
+        this_title.read(sqltext, val)
+    else:
+        this_title.read(sqltext)
+
+# exercise update and delete on one fixed Const id
+this_title.update("tt1001526", 7.23)
+this_title.delete("tt1001526")