This page is part of the documentation for the Machine Learning Database.
It is a static snapshot of a Notebook which you can play with interactively by trying MLDB online now.
It's free and takes 30 seconds to get going.
The MovieLens 20M dataset contains 20 million user ratings from 1 to 5 of thousands of movies. In this demo we'll build a simple recommendation system which will use this data to suggest 25 movies based on a seed movie you provide.
The notebook cells below use `pymldb`'s `Connection` class to make REST API calls. You can check out the Using `pymldb` Tutorial for more details.
from pymldb import Connection
# Shared handle used by every `mldb.put` / `mldb.post` / `mldb.query` call below.
# NOTE(review): no host is passed, so pymldb's default endpoint is used — confirm
# that matches where your MLDB instance is running.
mldb = Connection()
We'll start by using some command-line tools to download and decompress the data.
%%bash
# Download the MovieLens 20M archive and unpack it under /mldb_data/data.
# Stop at the first failing command so a failed download is never unzipped.
set -e
mkdir -p /mldb_data/data
# --fail: an HTTP error aborts instead of saving the error page as the zip.
# --silent --show-error: hide the progress meter but still report real errors
# (the previous `2>/dev/null` discarded both).
curl --fail --silent --show-error "http://public.mldb.ai/ml-20m.zip" -o /mldb_data/data/ml-20m.zip
# -o: overwrite without prompting, so re-running the cell does not stall on
# unzip's interactive "replace?" question.
unzip -o /mldb_data/data/ml-20m.zip -d /mldb_data/data
%%bash
# Show the first lines of the dataset's README to confirm the archive unpacked.
head /mldb_data/data/ml-20m/README.txt
%%bash
# Show the first lines of the ratings file that is imported into MLDB below.
head /mldb_data/data/ml-20m/ratings.csv
See the Loading Data Tutorial guide for more details on how to get data into MLDB.
Here we load a text file and use the `pivot` aggregator to create a sparse matrix representation of the ratings.
%%time
# Create (and immediately run) an `import.text` procedure that loads the raw
# ratings CSV into the `mvlns_ratings_csv` dataset.
print(mldb.put('/v1/procedures/import_mvlns', dict(
    type="import.text",
    params=dict(
        dataFileUrl="file:///mldb_data/data/ml-20m/ratings.csv",
        outputDataset="mvlns_ratings_csv",
        runOnCreation=True,
    ),
)))
# Create (and immediately run) a `transform` procedure that groups the imported
# ratings by user and pivots them, writing one row per userId to `mvlns_ratings`
# with the pivoted movieId/rating pairs as columns.
print(mldb.put('/v1/procedures/process_mvlns', dict(
    type="transform",
    params=dict(
        inputData="""
select pivot(movieId, rating) as *
named userId
from mvlns_ratings_csv
group by userId
""",
        outputDataset="mvlns_ratings",
        runOnCreation=True,
    ),
)))
# Peek at three rows of the pivoted dataset to sanity-check the transform.
mldb.query("select * from mvlns_ratings limit 3")
# Create (and immediately run) an `svd.train` procedure over the ratings matrix,
# restricted to the columns for which `rowCount() > 3`. It writes the column
# embedding to `mvlns_svd_embedding`, saves the model to `file://models/mvlns.svd`
# and registers the `mvlns_svd_embedder` function used by the queries below.
print(mldb.put('/v1/procedures/mvlns_svd', dict(
    type="svd.train",
    params=dict(
        trainingData="select COLUMN EXPR (where rowCount() > 3) from mvlns_ratings",
        columnOutputDataset="mvlns_svd_embedding",
        modelFileUrl="file://models/mvlns.svd",
        functionName="mvlns_svd_embedder",
        runOnCreation=True,
    ),
)))
Our dataset has `movieId`s but humans think about movie names, so we'll load up the movie names in a dataset.
from ipywidgets import interact, interact_manual
from uuid import uuid4
# Create (and immediately run) an `import.text` procedure that loads movies.csv
# into the `movies` dataset, selecting the title and movieId columns and naming
# each row by its movieId.
print(mldb.put('/v1/procedures/import_movies', dict(
    type="import.text",
    params=dict(
        dataFileUrl="file:///mldb_data/data/ml-20m/movies.csv",
        outputDataset="movies",
        select="title, movieId",
        named="movieId",
        runOnCreation=True,
    ),
)))
A simple search function to find all movies (and corresponding `movieId`s) whose names contain a string.
@interact
def movie_search(x = "toy story"):
    """Return the movies whose lower-cased title contains the search term ``x``.

    The term is trimmed and lower-cased, then embedded in a ``regex_match``
    pattern, so regular-expression metacharacters in ``x`` keep their regex
    meaning. The result is whatever ``mldb.query`` returns for the ``title``
    column of the ``movies`` dataset.
    """
    # Double any single quote so the term cannot end the SQL string literal
    # early (e.g. "schindler's" used to produce a broken query).
    term = x.strip().lower().replace("'", "''")
    return mldb.query(
        "select title from movies where regex_match(lower(title), '.*%s.*')" % term
    )
Now let's create a dataset to hold user preferences, and a simple function to simulate a user rating movies they like and movies they dislike, based on the `movie_search` function above.
# Mutable dataset that stores the simulated ratings, one row per user.
print(mldb.put("/v1/datasets/mvlns_user_prefs", dict(type="sparse.mutable")))

# SQL function returning, as `p`, all columns of the row whose name equals the
# `user` argument.
print(mldb.put("/v1/functions/preferences", dict(
    type="sql.query",
    params=dict(
        query="select {*} as p from mvlns_user_prefs where rowName()=$user",
    ),
)))
def save_prefs(user_id, likes, dislikes):
    """Record simulated ratings for ``user_id`` in the ``mvlns_user_prefs`` dataset.

    ``likes`` and ``dislikes`` are comma-separated search terms. Every movie
    that ``movie_search`` finds for a "like" term is rated 5, and every movie
    found for a "dislike" term is rated 1. Terms of three characters or fewer
    (after trimming surrounding whitespace) are ignored. The dataset is
    committed once, after all rows have been posted.
    """
    for rating, search_terms in zip([5, 1], [likes, dislikes]):
        for raw_term in search_terms.split(","):
            # Trim before the length check: otherwise the space after a comma
            # counts as a character and the same term is accepted or skipped
            # depending on its position in the list.
            term = raw_term.strip()
            if len(term) > 3:
                mldb.post("/v1/datasets/mvlns_user_prefs/rows", {
                    "rowName": user_id,
                    # One [column name, value, 0] triple per matched movie; the
                    # column name is the row index of the search result.
                    "columns": [[str(m), rating, 0] for m in movie_search(term).index]
                })
    mldb.post("/v1/datasets/mvlns_user_prefs/commit", {})
# Example: user "janedoe" likes movies matching "Toy Story" and dislikes
# movies matching "Terminator".
save_prefs("janedoe", "Toy Story", "Terminator")
# Read the stored preferences back through the `preferences` SQL function.
mldb.query("select preferences({ user: 'janedoe' })[p] as *")
With all that done, we can now build a recommendation engine out of a simple SQL query by mapping a user's preferences into the same space as the movie embeddings (i.e. embedding the user's preferences) and looking for the nearest movies.
# Nearest-neighbour lookup over the SVD column embedding; returns 25
# neighbours unless the caller asks for a different number.
print(mldb.put("/v1/functions/nearest_movies", dict(
    type="embedding.neighbors",
    params=dict(
        dataset="mvlns_svd_embedding",
        defaultNumNeighbors=25,
        columnName="embedding",
    ),
)))
# SQL function chaining the pieces above: fetch the user's preferences, embed
# them with `mvlns_svd_embedder`, then return the `distances` output of
# `nearest_movies` for that embedding as `r`.
print(mldb.put("/v1/functions/recommendations", dict(
    type="sql.query",
    params=dict(
        query="""
select nearest_movies({
coords: mvlns_svd_embedder({
row: preferences({ user: $user })[p]
})[embedding]
})[distances] as r
""",
    ),
)))
Here's a simple function which lets you simulate the results of liking and disliking certain movies and getting back the resulting recommendations.
def recommend(likes="Toy Story, Terminator", dislikes="Star Trek"):
    """Return recommended movie titles for a simulated new user.

    ``likes`` and ``dislikes`` are comma-separated search terms handed to
    ``save_prefs``. The result of the final ``mldb.query`` call is returned:
    movie titles, with rows named by ``movieId``.
    """
    # A fresh random id stands in for a brand-new user saving these preferences.
    new_user = str(uuid4())
    save_prefs(new_user, likes, dislikes)

    # The query:
    #   - retrieves the recommendations for that user,
    #   - transposes them and joins against `movies` to obtain titles,
    #   - drops movies the user has already rated,
    #   - orders by `r.result` (presumably the neighbour distance, since the
    #     `recommendations` function returns `[distances] as r`).
    query_template = """
select m.title
named m.movieId
from
transpose(( select recommendations({ user: '%(user)s' }) )) as r
join movies as m on r.rowPathElement(2) = m.rowPathElement(0)
where m.movieId not in (keys of preferences({ user: '%(user)s' })[p])
order by r.result
"""
    return mldb.query(query_template % {"user": new_user})
# Example run: likes "Toy Story" and "Terminator", dislikes "Star Trek".
recommend(likes="Toy Story, Terminator", dislikes="Star Trek")
Here's an interactive form that lets you play with this function to see if you agree with the recommendations!
NOTE: the interactive part of this demo only works if you're running this Notebook live, not if you're looking at a static copy on http://docs.mldb.ai. See the documentation for Running MLDB.
# Build an ipywidgets form for `recommend` from its keyword arguments.
# NOTE(review): `interact_manual` is expected to run the function only when its
# button is pressed — confirm against the installed ipywidgets version.
interact_manual(recommend)
Check out the other Tutorials and Demos.