# -*- coding: utf-8 -*-
# 吉吉于
#
# 【ResSys】SVD

import math
import random
import cPickle as pickle


#calculate the overall average
def Average(fileName):
    """Return the mean of the third whitespace-separated field over a file.

    Each non-blank line must have at least three whitespace-separated
    fields; the third one is parsed with int() and averaged.

    Args:
        fileName: path of the text file to read.

    Returns:
        The arithmetic mean of the parsed values, as a float.

    Raises:
        ZeroDivisionError: if the file contains no non-blank lines.
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if the third field is not an integer literal.
    """
    total = 0.0
    cnt = 0
    # The with-block closes the file even when a line fails to parse.
    with open(fileName, 'r') as fi:
        for line in fi:
            arr = line.split()
            if not arr:
                # Blank lines (e.g. a trailing newline) carry no rating.
                continue
            # split() already removes surrounding whitespace from each field.
            total += int(arr[2])
            cnt += 1
    return total / cnt



def InerProduct(v1, v2):
    """Return the dot product of v1 and v2.

    The products v1[i] * v2[i] are accumulated left to right for every
    index of v1, so v2 must be at least as long as v1 (extra elements of
    v2 are ignored). An empty v1 yields 0.
    """
    total = 0
    for idx, left in enumerate(v1):
        total += left * v2[idx]
    return total


def PredictScore(av, bu, bi, pu, qi):
    """Return the predicted score, clamped to the range [1, 5].

    The raw prediction is av + bu + bi plus the inner product of the two
    factor vectors pu and qi. Values below 1 are replaced by 1 and values
    above 5 by 5; anything in between is returned unchanged.
    """
    raw = av + bu + bi + InerProduct(pu, qi)
    if raw < 1:
        return 1
    if raw > 5:
        return 5
    return raw


def SVD(configureFile, testDataFile, trainDataFile, modelSaveFile, maxSteps=100):
    """Train the biased factor model with SGD and pickle it to a file.

    The first line of configureFile must hold six whitespace-separated
    values, in this order: average score, number of users, number of
    items, number of factors, learning rate, regularization weight.

    Every non-blank line of the training and test files must start with
    "<user id> <item id> <integer score>"; ids are turned into list
    indices by subtracting 1.

    After each pass over the training data the RMSE on testDataFile is
    printed. Training stops as soon as that RMSE fails to decrease, or
    after maxSteps passes.

    Args:
        configureFile: path of the configuration file.
        testDataFile: path of the ratings used to compute the RMSE.
        trainDataFile: path of the ratings used for the SGD updates.
        modelSaveFile: path the model is pickled to (bu, bi, qi, pu, in
            that order).
        maxSteps: upper bound on the number of training passes.
    """
    # Read the hyper-parameters; the with-block closes the file for us.
    with open(configureFile, 'r') as fi:
        arr = fi.readline().split()
    averageScore = float(arr[0])
    userNum = int(arr[1])
    itemNum = int(arr[2])
    factorNum = int(arr[3])
    learnRate = float(arr[4])
    regularization = float(arr[5])

    # Biases start at zero; factors start as small random values scaled
    # by 1 / sqrt(factorNum). qi is drawn before pu, as in the original,
    # so a seeded random generator reproduces the same initial model.
    bi = [0.0] * itemNum
    bu = [0.0] * userNum
    scale = math.sqrt(factorNum)
    qi = [[0.1 * random.random() / scale for _ in range(factorNum)]
          for _ in range(itemNum)]
    pu = [[0.1 * random.random() / scale for _ in range(factorNum)]
          for _ in range(userNum)]
    print("initialization end\nstart training\n")

    # train model
    preRmse = float('inf')
    for step in range(maxSteps):
        with open(trainDataFile, 'r') as fi:
            for line in fi:
                arr = line.split()
                if not arr:
                    # Skip blank lines instead of failing on arr[0].
                    continue
                uid = int(arr[0]) - 1
                iid = int(arr[1]) - 1
                score = int(arr[2])
                userVec = pu[uid]
                itemVec = qi[iid]
                prediction = PredictScore(averageScore, bu[uid], bi[iid],
                                          userVec, itemVec)

                eui = score - prediction

                # update parameters
                bu[uid] += learnRate * (eui - regularization * bu[uid])
                bi[iid] += learnRate * (eui - regularization * bi[iid])
                for k in range(factorNum):
                    # The item update must use the user factor as it was
                    # before this step, so save it before overwriting.
                    oldPu = userVec[k]
                    userVec[k] += learnRate * (eui * itemVec[k] - regularization * oldPu)
                    itemVec[k] += learnRate * (eui * oldPu - regularization * itemVec[k])

        curRmse = Validate(testDataFile, averageScore, bu, bi, pu, qi)
        print("test_RMSE in step %d: %f" % (step, curRmse))
        # NOTE: when this triggers, the parameters already include the
        # updates of the pass that made the RMSE stop improving.
        if curRmse >= preRmse:
            break
        preRmse = curRmse

    # Write the model; open() replaces the Python-2-only file() builtin.
    # Protocol 1 is what the original requested by passing True.
    with open(modelSaveFile, 'wb') as fo:
        pickle.dump(bu, fo, 1)
        pickle.dump(bi, fo, 1)
        pickle.dump(qi, fo, 1)
        pickle.dump(pu, fo, 1)
    print("model generation over")

#validate the model
def Validate(testDataFile, av, bu, bi, pu, qi):
    """Return the RMSE of the model's predictions on a ratings file.

    Every non-blank line of testDataFile must start with
    "<user id> <item id> <integer score>"; ids are turned into list
    indices by subtracting 1.

    Args:
        testDataFile: path of the ratings file.
        av: overall average score.
        bu: list of user biases.
        bi: list of item biases.
        pu: list of user factor vectors.
        qi: list of item factor vectors.

    Returns:
        The root of the mean squared difference between the true scores
        and the scores returned by PredictScore.

    Raises:
        ZeroDivisionError: if the file contains no non-blank lines.
    """
    cnt = 0
    squaredError = 0.0
    # The with-block closes the file even when a line fails to parse.
    with open(testDataFile, 'r') as fi:
        for line in fi:
            arr = line.split()
            if not arr:
                # Blank lines are neither scored nor counted.
                continue
            cnt += 1
            uid = int(arr[0]) - 1
            iid = int(arr[1]) - 1
            pScore = PredictScore(av, bu[uid], bi[iid], pu[uid], qi[iid])

            tScore = int(arr[2])
            diff = tScore - pScore
            squaredError += diff * diff
    return math.sqrt(squaredError / cnt)




#use the model to make predict
def Predict(configureFile, modelSaveFile, testDataFile, resultSaveFile):
    """Load a pickled model and write one predicted score per rating line.

    Args:
        configureFile: path of the configuration file; only the first
            value of its first line (the average score) is used here.
        modelSaveFile: path of the pickled model, read back as
            bu, bi, qi, pu in that order.
        testDataFile: path of the input file; every non-blank line must
            start with "<user id> <item id>". Ids are turned into list
            indices by subtracting 1.
        resultSaveFile: path of the output file; it receives one "%f"
            formatted score per non-blank input line, in input order.
    """
    # get parameter
    with open(configureFile, 'r') as fi:
        averageScore = float(fi.readline().split()[0])

    # get model
    # NOTE(security): unpickling can execute arbitrary code, so only load
    # model files that come from a trusted source.
    with open(modelSaveFile, 'rb') as fi:
        bu = pickle.load(fi)
        bi = pickle.load(fi)
        qi = pickle.load(fi)
        pu = pickle.load(fi)

    # predict
    with open(testDataFile, 'r') as fi:
        with open(resultSaveFile, 'w') as fo:
            for line in fi:
                arr = line.split()
                if not arr:
                    # Skip blank lines instead of failing on arr[0].
                    continue
                uid = int(arr[0]) - 1
                iid = int(arr[1]) - 1
                pScore = PredictScore(averageScore, bu[uid], bi[iid],
                                      pu[uid], qi[iid])
                fo.write("%f\n" % pScore)
    print("predict over")


if __name__ == '__main__':
    # Locations of the inputs and outputs used when run as a script.
    modelSaveFile = 'svd_model.pkl'
    resultSaveFile = 'prediction'
    configureFile = 'svd.conf'
    testDataFile = 'ml_data/test.txt'
    trainDataFile = 'ml_data/training.txt'

    # Only the training step runs by default. Two other steps can be run
    # from here by hand:
    #   Average("ua.base") computes the overall average of a ratings file;
    #   Predict(configureFile, modelSaveFile, testDataFile, resultSaveFile)
    #   writes the predictions of the saved model to resultSaveFile.
    SVD(
        configureFile=configureFile,
        testDataFile=testDataFile,
        trainDataFile=trainDataFile,
        modelSaveFile=modelSaveFile,
    )

# 转载请注明:于哲的博客 » 【ResSys】SVD