注:users.csv非ml-latest数据集中的内容,是任课老师便于前端展示而随机生成的文件
由于ubuntu20.04最低支持mysql8,而mycat1与后端flask相连会因为utf8mb4的编码和mysql8废弃的方法导致无法解决的问题,遂尝试使用最新适配mysql8的mycat2。以下是本人的野人献曝。
tag = pd.read_csv('ml-latest/genome-scores.csv').dropna()
movie = pd.read_csv('ml-latest/movies.csv').dropna()
link = pd.read_csv('ml-latest/links.csv').dropna()
tag.movieId = tag.movieId.astype(int)
movie.movieId = movie.movieId.astype(int)
link.movieId = link.movieId.astype(int)
movieIds = set(tag.movieId).intersection(set(movie.movieId)).intersection(set(link.movieId))
omid2nmid={mid:idx for idx,mid in enumerate(movieIds)}
tag.movieId = tag.movieId.apply(lambda x:omid2nmid[x] if x in omid2nmid else np.nan)
tag = tag.dropna().sort_values(by=["movieId","tagId"]).groupby('movieId').apply(lambda df:pd.Series(df['relevance'].tolist()))
np.save('ml-latest/processed_tags.npy',tag.to_numpy())
toptag = np.argsort(-tag.to_numpy())[:,:3]
toptagrelevance = -np.sort(-tag.to_numpy())[:,:3]
user = pd.read_csv('ml-latest/users.csv',encoding='gbk').dropna()
rating = pd.read_csv('ml-latest/ratings.csv').dropna()
rating.movieId = rating.movieId.apply(lambda x:omid2nmid[x] if x in omid2nmid else np.nan)
rating.userId = rating.userId.apply(lambda x:np.nan if x>user.shape[0] else x)
rating.dropna(inplace=True)
ouid2nuid={oid:idx for idx,oid in enumerate(rating.userId.unique())}
rating.userId = rating.userId.apply(lambda x:ouid2nuid[x])
user.userId = user.userId.apply(lambda x:ouid2nuid[x] if x in ouid2nuid else np.nan)
user.dropna(inplace=True)
np.save('ml-latest/processed_users.npy',user.sort_values(by="userId").reset_index(drop=True).to_numpy())
np.save('ml-latest/processed_ratings.npy',rating.sort_values(by="userId").to_numpy())
genres = ["Action","Adventure","Animation","Children's","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"]
genres2id = {genre:i for i,genre in enumerate(genres)}
np.save('ml-latest/genres2id.npy',genres2id)
movie.movieId = movie.movieId.apply(lambda x:omid2nmid[x] if x in omid2nmid else np.nan)
movie = movie.dropna().sort_values(by="movieId")
link.movieId = link.movieId.apply(lambda x:omid2nmid[x] if x in omid2nmid else np.nan)
link = link.dropna().sort_values(by="movieId")
genre = np.array([''.join(['1' if genre in genre_list else '0' for genre in genres]) for genre_list in movie.genres.str.split('|').tolist()])
np.save('ml-latest/processed_movies.npy',np.concatenate([movie.to_numpy()[:,1:-1],genre.reshape(-1,1),link.to_numpy().astype(np.int64)[:,1:],toptag,toptagrelevance],axis=1))
.
├── bin #mycat2 执行文件
├── conf #mycat2 配置文件
│ ├── clusters #集群,用来配置数据库读写分离
│ │ ├── movielens_cluster.cluster.json #movielens结点
│ │ └── prototype.cluster.json #初始结点
│ ├── datasources #数据源,用来配置分布式
│ │ ├── movielens.datasource.json #movielens数据源
│ │ ├── backup.datasource.json #备份数据源
│ │ └── prototypeDs.datasource.json #初始数据源
│ ├── schemas #数据库模式,用来配置数据库分片
│ │ ├── movielens.schema.json #movielens数据库分片配置文件
│ │ ├── mycat.schema.json #mycat数据库
│ │ └── mysql.schema.json #初始数据库
│ ├── server.json #mycat2服务器(虚拟数据库)配置文件
│ ├── sql #一些sql语句便于了解mycat2,替换为自己的sql语句
│ │ ├── create_movie.sql #创建movie分片表
│ │ ├── create_rating.sql #创建rating分片表
│ │ ├── create_user.sql #创建user分片表
│ │ ├── genre.sql #创建genre表并导入数据
│ │ ├── movie.sql #导入movie数据
│ │ ├── rating.sql #导入rating数据
│ │ ├── tag.sql #创建tag表并导入数据
│ │ └── user.sql #导入user数据
│ ├── sqlcaches
│ ├── state.json #mycat2备份配置文件
│ ├── users
│ │ └── root.user.json #mycat2用户配置文件
├── lib #mycat2依赖包
└── logs #日志文件
├── mycat.pid
└── wrapper.log
{
"clusterType":"MASTER_SLAVE",
"heartbeat":{
"heartbeatTimeout":1000,
"maxRetryCount":3,
"minSwitchTimeInterval":300,
"slaveThreshold":0.0
},
"masters":[
"movielens"
],
"maxCon":2000,
"name":"movielens_cluster",
"readBalanceType":"BALANCE_ALL",
"replicas":[
"backup"
],
"switchType":"SWITCH"
}
{
"dbType":"mysql",
"idleTimeout":60000,
"initSqls":[],
"initSqlsGetConnection":true,
"instanceType":"READ_WRITE",
"maxCon":1000,
"maxConnectTimeout":3000,
"maxRetryCount":5,
"minCon":1,
"name":"movielens",
"password":"数据库密码",
"queryTimeout":30,
"type":"JDBC",
"url":"jdbc:mysql://127.0.0.1:3306/movielens?serverTimezone=UTC&useUnicode=true&characterEncoding=UTF-8&autoReconnect=true",
"user":"root",
"weight":0
}
create_user.sql
/*+ mycat:createTable{
"schemaName":"movielens",
"shardingTable":{
"createTableSQL":"CREATE TABLE movielens.user (\n\tuserid INTEGER NOT NULL,\n\tgender VARCHAR(10),\n\tname VARCHAR(255) NOT NULL,\n\tpassword VARCHAR(200) NOT NULL,\n\tcreated_on DATETIME,\n\tlast_login DATETIME,\n\tPRIMARY KEY (userid)\n)",
"partition":{
"data":[["movielens_cluster","movielens_1","user_1","0","0","0"],["movielens_cluster","movielens_2","user_2","1","1","1"],["movielens_cluster","movielens_3","user_3","2","2","2"]]
},
"function":{
"clazz":"io.mycat.router.mycat1xfunction.PartitionByMod",
"properties":{
"count":"3",
"columnName":"userid"
},
}
},
"tableName":"user"
} */;
create_movie.sql
/*+ mycat:createTable{
"schemaName":"movielens",
"shardingTable":{
"createTableSQL":"CREATE TABLE movielens.`movie` (\n\t`movieid` int NOT NULL,\n\t`title` varchar(255) NOT NULL,\n\t`genre` varchar(20) NOT NULL,\n\t`src` varchar(255) DEFAULT NULL,\n\t`imdb` varchar(20) NOT NULL,\n\t`tmdb` varchar(20) NOT NULL,\n\t`tagtop1` int NOT NULL,\n\t`tagtop2` int NOT NULL,\n\t`tagtop3` int NOT NULL,\n\t`relevance1` float NOT NULL,\n\t`relevance2` float NOT NULL,\n\t`relevance3` float NOT NULL,\n\tPRIMARY KEY (`movieid`)\n)",
"partition":{
"data":[["movielens_cluster","movielens_1","movie_1","0","0","0"],["movielens_cluster","movielens_2","movie_2","1","1","1"],["movielens_cluster","movielens_3","movie_3","2","2","2"]]
},
"function":{
"clazz":"io.mycat.router.mycat1xfunction.PartitionByMod",
"properties":{
"count":"3",
"columnName":"movieid"
},
}
},
"tableName":"movie"
} */;
create_rating.sql
/*+ mycat:createTable{
"schemaName":"movielens",
"shardingTable":{
"createTableSQL":"CREATE TABLE movielens.rating (\n\tratingid INTEGER NOT NULL,\n\tuserid INTEGER NOT NULL,\n\tmovieid INTEGER NOT NULL,\n\trating FLOAT NOT NULL,\n\tcreated_on DATETIME,\n\tPRIMARY KEY (ratingid)\n)",
"partition":{
"data":[["movielens_cluster","movielens_1","rating_1","0","0","0"],["movielens_cluster","movielens_2","rating_2","1","1","1"],["movielens_cluster","movielens_3","rating_3","2","2","2"]]
},
"function":{
"clazz":"io.mycat.router.mycat1xfunction.PartitionByMod",
"properties":{
"count":"3",
"columnName":"userid"
},
}
},
"tableName":"rating"
} */;
执行sql
mysql -uroot -p -Dmovielens -h127.0.0.1 -P8066
mysql>source create_user.sql;
mysql>source create_movie.sql;
mysql>source create_rating.sql;
mysql>exit;
最终形成的配置文件
{
"customTables":{},
"globalTables":{},
"normalTables":{
"genre":{
"createTableSQL":"CREATE TABLE movielens.`genre` (\n\t`genreid` int NOT NULL AUTO_INCREMENT,\n\t`genre` varchar(50) NOT NULL,\n\tPRIMARY KEY (`genreid`)\n)",
"locality":{
"schemaName":"movielens",
"tableName":"genre",
"targetName":"prototype"
}
},
"tag":{
"createTableSQL":"CREATE TABLE movielens.`tag` (\n\t`tagid` int NOT NULL AUTO_INCREMENT,\n\t`tag` varchar(100) DEFAULT NULL,\n\tPRIMARY KEY (`tagid`)\n)",
"locality":{
"schemaName":"movielens",
"tableName":"tag",
"targetName":"prototype"
}
}
},
"schemaName":"movielens",
"shardingTables":{
"movie":{
"createTableSQL":"CREATE TABLE movielens.`movie` (\n\t`movieid` int NOT NULL,\n\t`title` varchar(255) NOT NULL,\n\t`genre` varchar(20) NOT NULL,\n\t`src` varchar(255) DEFAULT NULL,\n\t`imdb` varchar(20) NOT NULL,\n\t`tmdb` varchar(20) NOT NULL,\n\t`tagtop1` int NOT NULL,\n\t`tagtop2` int NOT NULL,\n\t`tagtop3` int NOT NULL,\n\t`relevance1` float NOT NULL,\n\t`relevance2` float NOT NULL,\n\t`relevance3` float NOT NULL,\n\tPRIMARY KEY (`movieid`)\n)",
"function":{
"clazz":"io.mycat.router.mycat1xfunction.PartitionByMod",
"properties":{
"count":"3",
"columnName":"movieid"
},
"ranges":{}
},
"partition":{
"data":[
[
"movielens_cluster",
"movielens_1",
"movie_1",
"0",
"0",
"0"
],
[
"movielens_cluster",
"movielens_2",
"movie_2",
"1",
"1",
"1"
],
[
"movielens_cluster",
"movielens_3",
"movie_3",
"2",
"2",
"2"
]
]
},
"shardingIndexTables":{}
},
"rating":{
"createTableSQL":"CREATE TABLE movielens.rating (\n\tratingid INTEGER NOT NULL,\n\tuserid INTEGER NOT NULL,\n\tmovieid INTEGER NOT NULL,\n\trating FLOAT NOT NULL,\n\tcreated_on DATETIME,\n\tPRIMARY KEY (ratingid)\n)",
"function":{
"clazz":"io.mycat.router.mycat1xfunction.PartitionByMod",
"properties":{
"count":"3",
"columnName":"userid"
},
"ranges":{}
},
"partition":{
"data":[
[
"movielens_cluster",
"movielens_1",
"rating_1",
"0",
"0",
"0"
],
[
"movielens_cluster",
"movielens_2",
"rating_2",
"1",
"1",
"1"
],
[
"movielens_cluster",
"movielens_3",
"rating_3",
"2",
"2",
"2"
]
]
},
"shardingIndexTables":{}
},
"user":{
"createTableSQL":"CREATE TABLE movielens.user (\n\tuserid INTEGER NOT NULL,\n\tgender VARCHAR(10),\n\tname VARCHAR(255) NOT NULL,\n\tpassword VARCHAR(200) NOT NULL,\n\tcreated_on DATETIME,\n\tlast_login DATETIME,\n\tPRIMARY KEY (userid)\n)",
"function":{
"clazz":"io.mycat.router.mycat1xfunction.PartitionByMod",
"properties":{
"count":"3",
"columnName":"userid"
},
"ranges":{}
},
"partition":{
"data":[
[
"movielens_cluster",
"movielens_1",
"user_1",
"0",
"0",
"0"
],
[
"movielens_cluster",
"movielens_2",
"user_2",
"1",
"1",
"1"
],
[
"movielens_cluster",
"movielens_3",
"user_3",
"2",
"2",
"2"
]
]
},
"shardingIndexTables":{}
}
},
"views":{}
}