// This is a design template for the functions to be used in the semantic model
// Semantic Embedding
// Created by Xugang Ye on 09/16/15.
// Updated by Xugang Ye on 05/08/16.
// Copyright © 2016 Xugang Ye. All rights reserved.
//

#include <vector>
#include <string>
#include <sstream>
#include <iostream>
#include <ctime>
#include <cstdlib>   // c's standard lib
#include <cstdio>    // c's standard io
#include <cstring>   // c's string
#include <cmath>     // c's math

using namespace std;

// Fundamental data structure 1: qd[][]
// It's a lookup table, qd[i][j] stores the j-th doc's id of the i-th query
// Note that qdCts[][], R[][], and relProb[][] are all based on the structure of qd[][]

// Fundamental data structure 2: x2q[][], x1q[][], zq[][]
// x2q[][] is the input features matrix for the queries, at the input layer, the entries of the first column are all 1's
// x2q[i][j] is the j-th input feature of the i-th query, x2q[i] is the input feature vector of the i-th query
// x1q[][] is the input features matrix for the queries, at the intermediate layer
// x1q[i][j] is the j-th input feature of the i-th query, x1q[i] is the input feature vector of the i-th query
// zq[][] is the mapped features matrix for the queries, there is no all 1's column
// zq[i][j] is the j-th mapped feature of the i-th query, zq[i] is the mapped feature vector of the i-th query
// Mapping sequence: x2q[][] -> x1q[][] -> zq[][]
// Note that the id of the i-th query is just i

// Fundamental data structure 3: x2d[][], x1d[][], zd[][]
// x2d[][] is the input features matrix for the docs, at the input layer, the entries of the first column are all 1's
// x2d[i][j] is the j-th input feature of the i-th doc, x2d[i] is the input feature vector of the i-th doc
// x1d[][] is the input features matrix for the docs, at the intermediate layer
// x1d[i][j] is the j-th input feature of the i-th doc, x1d[i] is the input feature vector of the i-th doc
// zd[][] is the mapped features matrix for the docs, there is no all 1's column
// zd[i][j] is the j-th mapped feature of the i-th doc, zd[i] is the mapped feature vector of the i-th doc
// Mapping sequence: x2d[][] -> x1d[][] -> zd[][]
// Note that the id of the i-th doc is just i

// Fundamental data structure 4: W2q[][], W2d[][], W1q[][], W1d[][]
// W2q[][]: the query part neural network mapping matrix, at the input layer, m by n (The first column is the bias column), m is mapped dimension, n is input dimension
// W2d[][]: the doc part neural network mapping matrix, at the input layer, m by n (The first column is the bias column), m is mapped dimension, n is input dimension
// Note that the input dimension of W2q[][] could be different from that of W2d[][]
// W1q[][]: the query part neural network mapping matrix, at the intermediate layer, m by n, m is mapped dimension, n is input dimension
// W1d[][]: the doc part neural network mapping matrix, at the intermediate layer, m by n, m is mapped dimension, n is input dimension
// Note that the input dimension of W1q[][] could be different from that of W1d[][]
//
//                            W2q[][]            W1q[][]
// Mapping sequence: x2q[][] -------> x1q[][] -------> zq[][]
//                            W2d[][]            W1d[][]
// Mapping sequence: x2d[][] -------> x1d[][] -------> zd[][]

// Notes on lookup details:
// For qd[i][j], the query id is i, the doc id is d = qd[i][j] (not j)
// So, qdCts[i][j] is the count of doc d under query i
// R[i][j] is the similarity between query i and doc d
// relProb[i][j] is the relevance probability P(doc d | query i)
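// Illustrative example of the lookup structure (hypothetical ids and counts, not from any real data,
// assuming an unused slot is marked by a negative id such as -1):
//   qd[0]     = {17, 42,  5, -1, -1, -1, -1, -1, -1, -1}   // query 0 has 3 valid docs out of 10 slots
//   qdCts[0]  = {12,  3,  1,  0,  0,  0,  0,  0,  0,  0}   // counts aligned with qd[0]
//   R[0][1]       -> cosine similarity between query 0 and doc 42 (the id stored at qd[0][1], not doc 1)
//   relProb[0][1] -> P(doc 42 | query 0)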
// Dot product
double dotProdcut(vector<double> &x, vector<double> &y)
{
    double result = 0.0;
    int n_x = x.size();
    int n_y = y.size();
    if (n_x == n_y)
    {
        int i;
        for (i = 0; i < n_x; i++)
            result += x[i]*y[i];
    }
    return (result);
}

// 2-norm
double twoNorm(vector<double> &x)
{
    double result = 0.0;
    int n_x = x.size();
    if (n_x > 0)
    {
        int i;
        for (i = 0; i < n_x; i++)
            result += x[i]*x[i];
    }
    result = sqrt(result);
    return (result);
}

// Cosine similarity of two vectors
double cosineSim(vector<double> &x, vector<double> &y)
{
    double result = 0.0;
    int n_x = x.size();
    int n_y = y.size();
    if ((n_x > 0) && (n_x == n_y))
        result = dotProdcut(x,y) / (twoNorm(x)*twoNorm(y));
    return (result);
}

// Vector-to-vector mapping (linear combination + activation)
// Input:
//   x[]: input feature vector
//   W[][]: mapping matrix (The first column is the bias column)
// Output:
//   y[]: mapped vector
void mappingVector_x2y(vector<double> &x, vector<vector<double>> &W, vector<double> &y)
{
    int m = W.size();    // Output dimension, which should be the same as the dimension of y[]
    int n = W[0].size(); // Input dimension, which should be the same as the dimension of x[]
    int i,j;
    for (i = 0; i < m; i++) // for the i-th dimension of the mapped vector
    {
        y[i] = 0.0;
        for (j = 0; j < n; j++) // for the j-th dimension of the input vector
            y[i] += W[i][j]*x[j]; // Linear mapping
        y[i] = tanh(y[i]); // Activation by tanh
    }
}

// Matrix-to-matrix mapping (linear combination + activation)
// Input:
//   x[][]: input feature matrix
//   W[][]: mapping matrix (The first column is the bias column)
// Output:
//   y[][]: mapped matrix
void mappingMatrix_x2y(vector<vector<double>> &x, vector<vector<double>> &W, vector<vector<double>> &y)
{
    int nRows = x.size(); // Number of rows of x[][] (and y[][])
    int m = W.size();     // Output dimension, which should be the same as the number of columns of y[][]
    int n = W[0].size();  // Input dimension, which should be the same as the number of columns of x[][]
    int k,i,j;
    for (k = 0; k < nRows; k++) // for the k-th row (of x[][] and y[][])
    {
        for (i = 0; i < m; i++) // for the i-th dimension of the mapped vector
        {
            y[k][i] = 0.0;
            for (j = 0; j < n; j++) // for the j-th dimension of the input vector
                y[k][i] += W[i][j]*x[k][j]; // Linear mapping
            y[k][i] = tanh(y[k][i]); // Activation by tanh
        }
    }
}

// This function is to calculate the vector of the 2-norms
// Input:
//   z[][]: final features matrix (no bias)
// Output:
//   z2norm[]: 2-norm vector of z[][]
void vectorsOf2norms(vector<vector<double>> &z, vector<double> &z2norm)
{
    int m,n,i,j;
    // 2-norms of z[][]
    m = z.size();    // Number of queries (docs)
    n = z[0].size(); // Number of mapped features per query (doc)
    for (i = 0; i < m; i++) // for the i-th query (doc)
    {
        z2norm[i] = 0.0;
        for (j = 0; j < n; j++) // for the j-th mapped feature of the i-th query (doc)
            z2norm[i] += z[i][j]*z[i][j];
        z2norm[i] = sqrt(z2norm[i]);
    }
}

// This function is to calculate the cosine similarity matrix
// Input:
//   qd[][]: (query id, doc id) matrix, 10 columns by default
//   zq[][]: final query features matrix (no bias)
//   zd[][]: final doc features matrix (no bias)
//   zq2norm[]: 2-norm vector of zq[][]
//   zd2norm[]: 2-norm vector of zd[][]
// Output:
//   R[][]: query-doc cosine similarity matrix, 10 columns by default
void cosineSimMatrix(vector<vector<int>> &qd, vector<vector<double>> &zq, vector<vector<double>> &zd,
                     vector<double> &zq2norm, vector<double> &zd2norm, vector<vector<double>> &R)
{
    int m = qd.size();    // Number of queries
    int n = qd[0].size(); // Number of docs per query
    int i,j,d;
    for (i = 0; i < m; i++) // for the i-th query
    {
        // Calculate the cosine similarities for the i-th query
        for (j = 0; j < n; j++) // for the j-th doc under the i-th query
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
                R[i][j] = dotProdcut(zq[i], zd[d]) / (zq2norm[i]*zd2norm[d]);
        }
    }
}
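// Minimal usage sketch for the feed-forward helpers (hypothetical dimensions; note that every output
// container must be pre-sized by the caller, since the functions above only overwrite entries):
//
//   int nQueries = 1000, nInput = 301, nHidden = 128, nMapped = 64;
//   vector<vector<double>> x2q(nQueries, vector<double>(nInput, 0.0));  // first column set to 1's
//   vector<vector<double>> W2q(nHidden, vector<double>(nInput, 0.01));
//   vector<vector<double>> W1q(nMapped, vector<double>(nHidden, 0.01));
//   vector<vector<double>> x1q(nQueries, vector<double>(nHidden, 0.0));
//   vector<vector<double>> zq(nQueries, vector<double>(nMapped, 0.0));
//   vector<double> zq2norm(nQueries, 0.0);
//   mappingMatrix_x2y(x2q, W2q, x1q);  // x2q -> x1q
//   mappingMatrix_x2y(x1q, W1q, zq);   // x1q -> zq
//   vectorsOf2norms(zq, zq2norm);
//   // ... same for the doc side, then cosineSimMatrix(qd, zq, zd, zq2norm, zd2norm, R)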
// This function is to calculate the relevance probability matrix
// Input:
//   qd[][]: (query id, doc id) matrix, 10 columns by default
//   R[][]: query-doc cosine similarity matrix, 10 columns by default
//   gamma: smooth parameter, by default it equals 10
// Output:
//   relProb[][]: query-doc relevance probability matrix, 10 columns by default
void relevanceProb(vector<vector<int>> &qd, vector<vector<double>> &R, double gamma, vector<vector<double>> &relProb)
{
    int m = R.size();    // Number of queries
    int n = R[0].size(); // Number of docs per query
    int i,j,d;
    double normalizer;
    for (i = 0; i < m; i++) // for the i-th query
    {
        // Calculate the exponentials for the i-th query
        normalizer = 0.0;
        for (j = 0; j < n; j++) // for the j-th doc under the i-th query
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
            {
                relProb[i][j] = exp(gamma*R[i][j]);
                normalizer += relProb[i][j];
            }
        }
        // Normalization
        for (j = 0; j < n; j++)
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
                relProb[i][j] /= normalizer;
        }
    }
}

// This function is to calculate the average per-(query-doc) loss given current model parameters
// Added by Xugang on 2016-03-19
// Input:
//   qd[][]: (query id, doc id) matrix, 10 columns by default
//   qdCts[][]: (query id, doc count) matrix, 10 columns by default
//   sc_smoother: smooth factor for the count score
//   relProb[][]: query-doc relevance probability matrix, 10 columns by default
// Output:
//   a real value
double SemanticModelOneLayerV01_Loss(vector<vector<int>> &qd, vector<vector<int>> &qdCts, double sc_smoother,
                                     vector<vector<double>> &relProb)
{
    double lossValue = 0.0;
    double deltaP = 1.0e-10; // Smoother
    int m = qd.size();    // Number of queries
    int n = qd[0].size(); // Number of docs per query
    int i,j,d;
    int num_qdpairs = 0;
    for (i = 0; i < m; i++) // for the i-th query
    {
        // Calculate the total loss for the i-th query
        for (j = 0; j < n; j++)
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
            {
                lossValue -= (qdCts[i][j]+sc_smoother) * log(relProb[i][j] + deltaP); // n_ij * logp_ij
                num_qdpairs++;
            }
        }
    }
    lossValue /= num_qdpairs;
    return (lossValue);
}

// This is to calculate the perfect loss value (when p_ij -> n_ij/sum{n_ij: j})
double SemanticModel_perfLoss(vector<vector<int>> &qd, vector<vector<int>> &qdCts, double sc_smoother)
{
    double lossValue = 0.0;
    double deltaP = 1.0e-10; // Smoother
    int m = qd.size();    // Number of queries
    int n = qd[0].size(); // Number of docs per query
    int i,j,d;
    int num_qdpairs = 0;
    int normalizer;
    for (i = 0; i < m; i++) // for the i-th query
    {
        // Calculate the total loss for the i-th query
        normalizer = 0;
        for (j = 0; j < n; j++)
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
                normalizer += qdCts[i][j];
        }
        for (j = 0; j < n; j++)
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
            {
                lossValue -= (qdCts[i][j]+sc_smoother) * log(qdCts[i][j]/(normalizer+0.0) + deltaP); // n_ij * logp_ij, in theory, p_ij should approach 0 when n_ij approaches 0
                num_qdpairs++;
            }
        }
    }
    lossValue /= num_qdpairs;
    return (lossValue);
}
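// Summary of the quantities computed above (notation follows the code comments):
//   relProb[i][j] = P(d | query i) = exp(gamma*R[i][j]) / sum_{j': qd[i][j'] >= 0} exp(gamma*R[i][j'])
//   loss = -(1/N) * sum_{i, j: qd[i][j] >= 0} (qdCts[i][j] + sc_smoother) * log(relProb[i][j] + deltaP)
// where N is the number of valid (query, doc) pairs; the perfect loss replaces relProb[i][j] by the
// empirical ratio qdCts[i][j] / sum_{j'} qdCts[i][j'].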
// !!! These two functions are to calculate the partial derivative matrix of R with respect to a particular w1_ij (at the intermediate layer)
// Note: this is the most crucial part of the learning process and parallel computing can be used for different w1_ij's
// Input:
//   i_W: row index
//   j_W: column index
//   I[]: indices of sampled queries
//   qd[][]: (query id, doc id) matrix, 10 columns by default
//   x1q[][]: query features matrix at the intermediate layer
//   zq[][]: final query features matrix (no bias)
//   zd[][]: final doc features matrix (no bias)
//   zq2norm[]: 2-norm vector of zq[][]
//   zd2norm[]: 2-norm vector of zd[][]
//   R[][]: query-doc cosine similarity matrix, 10 columns by default
// Output:
//   dRq[][]: partial derivative of R[][] with respect to w1_ij in W1q[][]
void partialDerivativeOfR_query(int i_W, int j_W, vector<int> &I, vector<vector<int>> &qd,
                                vector<vector<double>> &x1q, vector<vector<double>> &zq, vector<vector<double>> &zd,
                                vector<double> &zq2norm, vector<double> &zd2norm,
                                vector<vector<double>> &R, vector<vector<double>> &dRq)
{
    int m = qd.size();    // Number of queries
    int n = qd[0].size(); // Number of docs per query
    int i,j,d;
    int iSample;
    int nSamples = I.size();
    for (iSample = 0; iSample < nSamples; iSample++)
    {
        i = I[iSample]; // for the i-th query
        for (j = 0; j < n; j++) // for the j-th doc under the i-th query
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
            {
                dRq[i][j] = zd[d][i_W]/zd2norm[d] - R[i][j]*zq[i][i_W]/zq2norm[i];
                dRq[i][j] *= (1.0 - zq[i][i_W]*zq[i][i_W]) * x1q[i][j_W];
                dRq[i][j] /= zq2norm[i];
            }
        }
    }
}

// Input:
//   i_W: row index
//   j_W: column index
//   I[]: indices of sampled queries
//   qd[][]: (query id, doc id) matrix, 10 columns by default
//   x1d[][]: doc features matrix at the intermediate layer
//   zq[][]: final query features matrix (no bias)
//   zd[][]: final doc features matrix (no bias)
//   zq2norm[]: 2-norm vector of zq[][]
//   zd2norm[]: 2-norm vector of zd[][]
//   R[][]: query-doc cosine similarity matrix, 10 columns by default
// Output:
//   dRd[][]: partial derivative of R[][] with respect to w1_ij in W1d[][]
void partialDerivativeOfR_doc(int i_W, int j_W, vector<int> &I, vector<vector<int>> &qd,
                              vector<vector<double>> &x1d, vector<vector<double>> &zq, vector<vector<double>> &zd,
                              vector<double> &zq2norm, vector<double> &zd2norm,
                              vector<vector<double>> &R, vector<vector<double>> &dRd)
{
    int m = qd.size();    // Number of queries
    int n = qd[0].size(); // Number of docs per query
    int i,j,d;
    int iSample;
    int nSamples = I.size();
    for (iSample = 0; iSample < nSamples; iSample++)
    {
        i = I[iSample]; // for the i-th query
        for (j = 0; j < n; j++) // for the j-th doc under the i-th query
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
            {
                dRd[i][j] = zq[i][i_W]/zq2norm[i] - R[i][j]*zd[d][i_W]/zd2norm[d];
                dRd[i][j] *= (1.0 - zd[d][i_W]*zd[d][i_W]) * x1d[d][j_W];
                dRd[i][j] /= zd2norm[d];
            }
        }
    }
}
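// Possible parallelization sketch (an assumption, not part of the original design): each (i_W, j_W)
// entry is independent once x1/z/R are fixed, so the per-weight derivative and update could be
// distributed with OpenMP (compile with -fopenmp), using one derivative buffer per thread, e.g.
//
//   #pragma omp parallel for collapse(2) schedule(dynamic)
//   for (int i_W = 0; i_W < mW1; i_W++)
//       for (int j_W = 0; j_W < nW1q; j_W++)
//       {
//           vector<vector<double>> dRq_local(m, vector<double>(n, 0.0)); // thread-private buffer
//           partialDerivativeOfR_query(i_W, j_W, I, qd, x1q, zq, zd, zq2norm, zd2norm, R, dRq_local);
//           // ... accumulate dL from dRq_local and update W1q[i_W][j_W] as in the learning function below ...
//       }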
// !!! These two functions are to calculate the partial derivative matrix of R with respect to a particular w2_ij
// Note: this is the most crucial part of the learning process and parallel computing can be used for different w2_ij's
// !!! To write two functions here (April 23, 2016)
// Input:
//   i_W: row index
//   j_W: column index
//   I[]: indices of sampled queries
//   qd[][]: (query id, doc id) matrix, 10 columns by default
//   x1q[][]: query features matrix at the intermediate layer
//   x2q[][]: query features matrix at the input layer
//   zq[][]: final query features matrix (no bias)
//   zd[][]: final doc features matrix (no bias)
//   zq2norm[]: 2-norm vector of zq[][]
//   zd2norm[]: 2-norm vector of zd[][]
//   W1q[][]: the query part neural network mapping matrix, at the intermediate layer
//   R[][]: query-doc cosine similarity matrix, 10 columns by default
// Output:
//   dRq[][]: partial derivative of R[][] with respect to w2_ij in W2q[][]
void partialDerivativeOfR_query_add1layer(int i_W, int j_W, vector<int> &I, vector<vector<int>> &qd,
                                          vector<vector<double>> &x1q, vector<vector<double>> &x2q,
                                          vector<vector<double>> &zq, vector<vector<double>> &zd,
                                          vector<double> &zq2norm, vector<double> &zd2norm,
                                          vector<vector<double>> &W1q, vector<vector<double>> &R,
                                          vector<vector<double>> &dRq)
{
    int m = qd.size();    // Number of queries
    int n = qd[0].size(); // Number of docs per query
    int i,j,d;
    int iSample;
    int nSamples = I.size();
    int k;
    int n_z = zq[0].size();
    double aTerm;
    for (iSample = 0; iSample < nSamples; iSample++)
    {
        i = I[iSample]; // for the i-th query
        for (j = 0; j < n; j++) // for the j-th doc under the i-th query
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
            {
                dRq[i][j] = 0.0;
                for (k = 0; k < n_z; k++)
                {
                    aTerm = zd[d][k]/zd2norm[d] - R[i][j]*zq[i][k]/zq2norm[i];
                    aTerm *= (1.0 - zq[i][k]*zq[i][k]) * W1q[k][i_W];
                    dRq[i][j] += aTerm;
                }
                dRq[i][j] /= zq2norm[i]; // At this point, it's dR(q,d) / dx^(q)_i in equation (11)
                dRq[i][j] *= (1.0 - x1q[i][i_W]*x1q[i][i_W]) * x2q[i][j_W]; // Now, it's equation (13)
            }
        }
    }
}

// Input:
//   i_W: row index
//   j_W: column index
//   I[]: indices of sampled queries
//   qd[][]: (query id, doc id) matrix, 10 columns by default
//   x1d[][]: doc features matrix at the intermediate layer
//   x2d[][]: doc features matrix at the input layer
//   zq[][]: final query features matrix (no bias)
//   zd[][]: final doc features matrix (no bias)
//   zq2norm[]: 2-norm vector of zq[][]
//   zd2norm[]: 2-norm vector of zd[][]
//   W1d[][]: the doc part neural network mapping matrix, at the intermediate layer
//   R[][]: query-doc cosine similarity matrix, 10 columns by default
// Output:
//   dRd[][]: partial derivative of R[][] with respect to w2_ij in W2d[][]
void partialDerivativeOfR_doc_add1layer(int i_W, int j_W, vector<int> &I, vector<vector<int>> &qd,
                                        vector<vector<double>> &x1d, vector<vector<double>> &x2d,
                                        vector<vector<double>> &zq, vector<vector<double>> &zd,
                                        vector<double> &zq2norm, vector<double> &zd2norm,
                                        vector<vector<double>> &W1d, vector<vector<double>> &R,
                                        vector<vector<double>> &dRd)
{
    int m = qd.size();    // Number of queries
    int n = qd[0].size(); // Number of docs per query
    int i,j,d;
    int iSample;
    int nSamples = I.size();
    int k;
    int n_z = zd[0].size();
    double aTerm;
    for (iSample = 0; iSample < nSamples; iSample++)
    {
        i = I[iSample]; // for the i-th query
        for (j = 0; j < n; j++) // for the j-th doc under the i-th query
        {
            d = qd[i][j]; // doc id
            if (d >= 0)   // If doc id is valid
            {
                dRd[i][j] = 0.0;
                for (k = 0; k < n_z; k++)
                {
                    aTerm = zq[i][k]/zq2norm[i] - R[i][j]*zd[d][k]/zd2norm[d];
                    aTerm *= (1.0 - zd[d][k]*zd[d][k]) * W1d[k][i_W];
                    dRd[i][j] += aTerm;
                }
                dRd[i][j] /= zd2norm[d]; // At this point, it's dR(q,d) / dx^(d)_i in equation (12)
                dRd[i][j] *= (1.0 - x1d[d][i_W]*x1d[d][i_W]) * x2d[d][j_W]; // Now, it's equation (14)
            }
        }
    }
}
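// Summary of the chain rule implemented above (notation follows the code; see equations (11)-(14)
// referenced in the comments):
//   dR(q,d)/dw2_ij = [ sum_k ( zd[d][k]/||zd[d]|| - R(q,d)*zq[i][k]/||zq[i]|| )
//                            * (1 - zq[i][k]^2) * W1q[k][i_W] ] / ||zq[i]||
//                    * (1 - x1q[i][i_W]^2) * x2q[i][j_W]
// i.e. the one-layer derivative propagated through W1 and the tanh activation of the intermediate
// layer; the doc-side derivative is symmetric with the roles of q and d swapped.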
// This function is to do the group sampling from the query set of the training data
// The queries are divided into groups by their probabilities
vector<int> querySampling(vector<double> &qf, int nSamples, int nGroups)
{ // Begin function
    vector<int> I; // I[] is to store the ids of the sampled queries
    int m = qf.size(); // Number of queries
    int i,j,k,kk,kCt;
    if (nSamples >= m)
    {
        for (i = 0; i < m; i++)
            I.push_back(i);
    }
    else // nSamples < m
    { // else
        // Frequency grouping
        vector<int> selected(m, 0);
        vector<int> groupCts(nGroups, 0);
        vector<int> headIndices(nGroups, 0);
        vector<int> tailIndices(nGroups, 0);
        double fmax;
        double fmin;
        double deltaf;
        // Find max freq. and min freq.
        fmax = qf[0];
        fmin = qf[0];
        for (i = 0; i < m; i++)
        {
            if (fmax < qf[i])
                fmax = qf[i];
            if (fmin > qf[i])
                fmin = qf[i];
        }
        // Assign group number to each query, count how many in each group, find the head and tail indices of each group
        deltaf = (fmax - fmin) / nGroups;
        for (i = 0; i < m; i++) // for the i-th query
        {
            //j = 1;
            //while (qf[i] < fmax - j*deltaf) // i.e. j < (fmax - qf[i])/deltaf
            //    j++;
            j = floor((fmax - qf[i])/deltaf) + 1;
            if (j > nGroups)
                j = nGroups;
            // Now j is the group number (starting from 1) of query i
            groupCts[j-1]++;
            if (groupCts[j-1] == 1)
                headIndices[j-1] = i;
            tailIndices[j-1] = i;
        }
        // Now groupCts[j-1] stores the number of queries in group j, and headIndices[j-1] stores the head index of group j
        // Initialize random number generator
        srand(time(NULL));
        // Sampling
        i = 0; // i will count number of queries sampled
        while (i < nSamples)
        {
            // Sample a group number, that is randomly select one from 1, 2, ..., nGroups
            j = rand() % nGroups + 1;
            // Sample a query from group j
            if (groupCts[j-1] > 0)
            {
                kk = rand() % groupCts[j-1] + 1; // The kk-th one in group j
                kCt = 0;
                k = headIndices[j-1];
                while (k <= tailIndices[j-1])
                {
                    if (selected[k] == 0) // If the k-th query has not been selected
                    {
                        kCt++;
                        if (kCt == kk)
                            break;
                    }
                    k++;
                }
                I.push_back(k);
                selected[k] = 1;
                groupCts[j-1]--;
                i++; // To sample the next query
            }
        }
    } // End else
    return (I);
} // End function
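// Minimal usage sketch for querySampling (hypothetical frequencies; the counts are illustrative):
//   vector<double> qf = {0.30, 0.05, 0.20, 0.01, 0.44}; // per-query frequencies
//   vector<int> I = querySampling(qf, 3, 2);            // sample 3 of the 5 queries using 2 frequency groups
//   // I holds 3 distinct query ids; a group is picked uniformly at random, then a query within that group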
// This function is to learn the parameters of the semantic model of version 2.0, using the stochastic gradient descent search
// Added by Xugang on 2016-05-08
// Input:
//   qd[][]: (query id, doc id) matrix, 10 columns by default
//   qdCts[][]: (query id, doc count) matrix, 10 columns by default
//   sc_smoother: smooth factor for the count score
//   xq[][]: query features matrix (The first column of feature matrix is a column of 1's)
//   xd[][]: doc features matrix (The first column of feature matrix is a column of 1's)
//   learnRate: learning rate
//   tmax: maximum number of iterations
//   gamma: smooth parameter of the relevance probability
//   qf[]: query frequency vector
//   nSamples: number of sampled queries
//   nGroups: number of query probability groups
// Output:
//   W1q[][]: the query part neural network mapping matrix, at the intermediate layer, m by n
//   W1d[][]: the doc part neural network mapping matrix, at the intermediate layer, m by n
//   W2q[][]: the query part neural network mapping matrix, at the input layer, m by n (The first column is the bias column)
//   W2d[][]: the doc part neural network mapping matrix, at the input layer, m by n (The first column is the bias column)
//   R[][]: query-doc cosine similarity matrix (for the training data)
//   relProb[][]: query-doc relevance probability matrix, 10 columns by default
//   lossValues[]: the loss values in iterations, the size of this array is tmax
void SemanticModelTwoLayerV01_Learning(vector<vector<int>> &qd, vector<vector<int>> &qdCts, double sc_smoother,
                                       vector<vector<double>> &xq, vector<vector<double>> &xd,
                                       double learnRate, int tmax, double gamma,
                                       vector<double> &qf, int nSamples, int nGroups,
                                       vector<vector<double>> &W1q, vector<vector<double>> &W1d,
                                       vector<vector<double>> &W2q, vector<vector<double>> &W2d,
                                       vector<vector<double>> &R, vector<vector<double>> &relProb,
                                       vector<double> &lossValues)
{ // Begin function
    int m = qd.size();       // Number of queries
    int n = qd[0].size();    // Number of docs per query
    int numdocs = xd.size(); // Number of docs
    int i,j,d,k,dd;
    int mW1 = W1q.size();     // Output dimension mW1
    int nW1q = W1q[0].size(); // Input dimension nW1q
    int nW1d = W1d[0].size(); // Input dimension nW1d
    int mW2q = W2q.size();    // Output dimension, note that mW2q = nW1q
    int nW2q = W2q[0].size(); // Input dimension nW2q
    int mW2d = W2d.size();    // Output dimension, note that mW2d = nW1d
    int nW2d = W2d[0].size(); // Input dimension nW2d
    int i_W, j_W;
    double dL, dLterm;
    vector<int> I;
    int iSample;

    // Request memory chunks
    vector<double> z(mW1, 0.0); // mW1 mapped features per query (doc)
    vector<vector<double>> zq;
    vector<vector<double>> zd;
    for (i = 0; i < m; i++) // for the i-th query
        zq.push_back(z);
    for (i = 0; i < numdocs; i++) // for the i-th doc
        zd.push_back(z);
    vector<double> zq2norm(m, 0.0);
    vector<double> zd2norm(numdocs, 0.0);
    vector<double> x1q_in(nW1q, 0.0);
    vector<double> x1d_in(nW1d, 0.0);
    vector<vector<double>> x1q;
    vector<vector<double>> x1d;
    for (i = 0; i < m; i++) // for the i-th query
        x1q.push_back(x1q_in);
    for (i = 0; i < numdocs; i++) // for the i-th doc
        x1d.push_back(x1d_in);
    vector<double> p(n, 0.0); // n docs per query
    vector<vector<double>> dRq;
    vector<vector<double>> dRd;
    for (i = 0; i < m; i++) // for the i-th query
        dRq.push_back(p);
    for (i = 0; i < m; i++) // for the i-th query
        dRd.push_back(p);

    // Main iterations
    int t = 0;
    while (t < tmax)
    {
        cout << "iteration: " << t << endl;

        // !!! Feedforward process
        // Obtain the mapped feature vectors via feed-forward calculation
        mappingMatrix_x2y(xq, W2q, x1q);  // W2q[][]: mW2q by nW2q
        mappingMatrix_x2y(x1q, W1q, zq);  // W1q[][]: mW1 by nW1q (= mW2q)
        mappingMatrix_x2y(xd, W2d, x1d);  // W2d[][]: mW2d by nW2d
        mappingMatrix_x2y(x1d, W1d, zd);  // W1d[][]: mW1 by nW1d (= mW2d)
        // Compute the 2-norms of the mapped feature vectors
        vectorsOf2norms(zq, zq2norm);
        vectorsOf2norms(zd, zd2norm);
        // Compute the cosine similarity matrix
        cosineSimMatrix(qd, zq, zd, zq2norm, zd2norm, R);
        // Compute the relevance probability matrix
        relevanceProb(qd, R, gamma, relProb);
        // Compute the current loss value
        lossValues[t] = SemanticModelOneLayerV01_Loss(qd, qdCts, sc_smoother, relProb);
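        // The weight updates below implement, for each sampled query i and each of its valid docs j
        // (with doc id d = qd[i][j]):
        //   dL/dw += -gamma * (qdCts[i][j] + sc_smoother)
        //                   * sum_{k != j, qd[i][k] >= 0} relProb[i][k] * (dR[i][j]/dw - dR[i][k]/dw)
        // followed by the gradient step w = w - learnRate * dL/dw, with dR[][]/dw supplied by the
        // partialDerivativeOfR_* functions above.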
        // !!! Backpropagation process (note that: W2q[][] and W2d[][] first, then W1q[][] and W1d[][])
        // Query sampling
        I = querySampling(qf, nSamples, nGroups);
        if ((int)I.size() < nSamples)
            nSamples = I.size(); // Guard: querySampling returns at most one id per query
        cout << "Number of sampled queries: " << I.size() << ", loss value per (query, doc)-pair: " << lossValues[t] << endl;

        // Update W2q[][]
        for (i_W = 0; i_W < mW2q; i_W++)
        {
            for (j_W = 0; j_W < nW2q; j_W++)
            { // for (i_W, j_W)
                // Compute the partial derivative matrix of R with respect to a particular w2_ij in W2q
                partialDerivativeOfR_query_add1layer(i_W, j_W, I, qd, x1q, xq, zq, zd, zq2norm, zd2norm, W1q, R, dRq);
                // Find the partial derivative of the loss function with respect to W2q[i_W][j_W]
                dL = 0.0;
                for (iSample = 0; iSample < nSamples; iSample++)
                {
                    i = I[iSample]; // for the i-th query
                    for (j = 0; j < n; j++) // for the j-th doc under the i-th query
                    {
                        d = qd[i][j]; // doc id
                        if (d >= 0)   // If doc id is valid
                        {
                            dLterm = 0.0;
                            for (k = 0; k < n; k++)
                            {
                                dd = qd[i][k]; // doc id
                                if (dd >= 0 && k != j) // If doc id is valid
                                    dLterm += relProb[i][k] * (dRq[i][j] - dRq[i][k]); // P(d'|q)*(dR(q,d) - dR(q,d'))
                            }
                            dLterm *= (qdCts[i][j] + sc_smoother);
                            dL += dLterm;
                        }
                    }
                }
                dL *= -gamma;
                // Update W2q[i_W][j_W]
                W2q[i_W][j_W] = W2q[i_W][j_W] - learnRate*dL;
            }
        } // End of updating W2q[][]

        // Update W2d[][]
        for (i_W = 0; i_W < mW2d; i_W++)
        {
            for (j_W = 0; j_W < nW2d; j_W++)
            { // for (i_W, j_W)
                // Compute the partial derivative matrix of R with respect to a particular w2_ij in W2d
                partialDerivativeOfR_doc_add1layer(i_W, j_W, I, qd, x1d, xd, zq, zd, zq2norm, zd2norm, W1d, R, dRd);
                // Find the partial derivative of the loss function with respect to W2d[i_W][j_W]
                dL = 0.0;
                for (iSample = 0; iSample < nSamples; iSample++)
                {
                    i = I[iSample]; // for the i-th query
                    for (j = 0; j < n; j++) // for the j-th doc under the i-th query
                    {
                        d = qd[i][j]; // doc id
                        if (d >= 0)   // If doc id is valid
                        {
                            dLterm = 0.0;
                            for (k = 0; k < n; k++)
                            {
                                dd = qd[i][k]; // doc id
                                if (dd >= 0 && k != j) // If doc id is valid
                                    dLterm += relProb[i][k] * (dRd[i][j] - dRd[i][k]); // P(d'|q)*(dR(q,d) - dR(q,d'))
                            }
                            dLterm *= (qdCts[i][j] + sc_smoother);
                            dL += dLterm;
                        }
                    }
                }
                dL *= -gamma;
                // Update W2d[i_W][j_W]
                W2d[i_W][j_W] = W2d[i_W][j_W] - learnRate*dL;
            }
        } // End of updating W2d[][]

        // Update W1q[][]
        for (i_W = 0; i_W < mW1; i_W++)
        {
            for (j_W = 0; j_W < nW1q; j_W++)
            { // for (i_W, j_W)
                // Compute the partial derivative matrix of R with respect to a particular w1_ij in W1q
                partialDerivativeOfR_query(i_W, j_W, I, qd, x1q, zq, zd, zq2norm, zd2norm, R, dRq);
                // Find the partial derivative of the loss function with respect to W1q[i_W][j_W]
                dL = 0.0;
                for (iSample = 0; iSample < nSamples; iSample++)
                {
                    i = I[iSample]; // for the i-th query
                    for (j = 0; j < n; j++) // for the j-th doc under the i-th query
                    {
                        d = qd[i][j]; // doc id
                        if (d >= 0)   // If doc id is valid
                        {
                            dLterm = 0.0;
                            for (k = 0; k < n; k++)
                            {
                                dd = qd[i][k]; // doc id
                                if (dd >= 0 && k != j) // If doc id is valid
                                    dLterm += relProb[i][k] * (dRq[i][j] - dRq[i][k]); // P(d'|q)*(dR(q,d) - dR(q,d'))
                            }
                            dLterm *= (qdCts[i][j] + sc_smoother);
                            dL += dLterm;
                        }
                    }
                }
                dL *= -gamma;
                // Update W1q[i_W][j_W]
                W1q[i_W][j_W] = W1q[i_W][j_W] - learnRate*dL;
            }
        } // End of updating W1q[][]
        // Update W1d[][]
        for (i_W = 0; i_W < mW1; i_W++)
        {
            for (j_W = 0; j_W < nW1d; j_W++)
            { // for (i_W, j_W)
                // Compute the partial derivative matrix of R with respect to a particular w1_ij in W1d
                partialDerivativeOfR_doc(i_W, j_W, I, qd, x1d, zq, zd, zq2norm, zd2norm, R, dRd);
                // Find the partial derivative of the loss function with respect to W1d[i_W][j_W]
                dL = 0.0;
                for (iSample = 0; iSample < nSamples; iSample++)
                {
                    i = I[iSample]; // for the i-th query
                    for (j = 0; j < n; j++) // for the j-th doc under the i-th query
                    {
                        d = qd[i][j]; // doc id
                        if (d >= 0)   // If doc id is valid
                        {
                            dLterm = 0.0;
                            for (k = 0; k < n; k++)
                            {
                                dd = qd[i][k]; // doc id
                                if (dd >= 0 && k != j) // If doc id is valid
                                    dLterm += relProb[i][k] * (dRd[i][j] - dRd[i][k]); // P(d'|q)*(dR(q,d) - dR(q,d'))
                            }
                            dLterm *= (qdCts[i][j] + sc_smoother);
                            dL += dLterm;
                        }
                    }
                }
                dL *= -gamma;
                // Update W1d[i_W][j_W]
                W1d[i_W][j_W] = W1d[i_W][j_W] - learnRate*dL;
            }
        } // End of updating W1d[][]

        t++; // To next iteration
    }
} // End function

// This function is to learn the parameters of the semantic model of version 2.0, using the stochastic gradient descent search
// Added by Xugang on 2016-04-23
// To write a function here

// This function splits a string into tokens by a delimiter
vector<string> myStrSplitV01(string str, char delimiter)
{
    vector<string> result;
    stringstream ss(str); // Turn the string into a stream
    string aToken;
    while (getline(ss, aToken, delimiter))
        result.push_back(aToken);
    return (result);
}
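// Illustrative end-to-end sketch (an assumption about how a caller might wire things together; all
// dimensions, initial values, and hyperparameters below are hypothetical, and every output container
// must be pre-sized because the learning function only overwrites entries):
//
//   vector<vector<int>> qd, qdCts;   // loaded from training data, 10 columns per query
//   vector<vector<double>> xq, xd;   // input features, first column all 1's
//   vector<double> qf;               // query frequencies
//   int nHiddenQ = 128, nHiddenD = 128, nMapped = 64, tmax = 50, nSamples = 1000, nGroups = 5;
//   vector<vector<double>> W2q(nHiddenQ, vector<double>(xq[0].size(), 0.01));
//   vector<vector<double>> W2d(nHiddenD, vector<double>(xd[0].size(), 0.01));
//   vector<vector<double>> W1q(nMapped, vector<double>(nHiddenQ, 0.01));
//   vector<vector<double>> W1d(nMapped, vector<double>(nHiddenD, 0.01));
//   vector<vector<double>> R(qd.size(), vector<double>(qd[0].size(), 0.0));
//   vector<vector<double>> relProb(qd.size(), vector<double>(qd[0].size(), 0.0));
//   vector<double> lossValues(tmax, 0.0);
//   SemanticModelTwoLayerV01_Learning(qd, qdCts, 0.1, xq, xd, 0.001, tmax, 10.0, qf,
//                                     nSamples, nGroups, W1q, W1d, W2q, W2d, R, relProb, lossValues);
//
// Minimal usage sketch for myStrSplitV01 (illustrative input):
//   vector<string> fields = myStrSplitV01("123\tsome query text\t0.85", '\t');
//   // fields[0] == "123", fields[1] == "some query text", fields[2] == "0.85"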