提交所有LaTeX文件。

2024-09-02 17:47:53 +08:00
parent 10e6004666
commit b2fb901612
131 changed files with 17921 additions and 0 deletions
--- a/深度学习/实验/Retrieval_task/实验报告.tex
+++ b/深度学习/实验/Retrieval_task/实验报告.tex
@@ -0,0 +1,110 @@
+\documentclass[a4paper]{ctexart}
+\usepackage[margin=1in]{geometry}
+\usepackage{booktabs}
+\usepackage{hyperref}
+\usepackage{graphicx}
+\usepackage[numbers]{gbt7714}
+\RequirePackage[outputdir=./latex-output]{minted}
+\setlength{\belowcaptionskip}{1em}
+
+\title{《深度学习》实验报告}
+\author{姓名：岳锦鹏\qquad 学号：10213903403\qquad 专业：统计学-计算机\qquad 学院：统计学院}
+\date{2024年6月8日}
+\ctexset {
+    section = {
+        name = {,、}, 
+        format += \raggedright,
+        number = \chinese{section},
+    },
+    subsection = {
+        % name = {（,）},
+        number = \arabic{subsection}
+    }
+}
+
+\begin{document}
+    \maketitle
+    \section{实验环境}
+    \noindent requirements.txt：
+    \begin{minted}[frame=leftline, framesep=1em, framerule=1pt]{python}
+numpy
+paddlepaddle-gpu
+scikit-learn
+tqdm
+    \end{minted}
+    这些代码库的作用显而易见，其中 \mintinline{Python}{scikit-learn (sklearn)} 只用于划分训练集和验证集。
+
+    \section{实验过程}
+    \subsection{实验思路}
+    \subsubsection{确定首次召回个数}
+    首先尝试了完全不使用baseline的方法，直接从全部的几千个文档中召回3个文档，发现效果完全不如baseline。于是想到采用二次召回：第一次采用baseline的方法，从几千个文档中召回一部分；第二次再从召回的这些文档中选出3个。关于第一次召回多少，我首先按照baseline的方式直接计算余弦相似度，做了top k召回率的实验，结果如表~\ref{first retrieve num} 所示。可以看到当首次召回数量达到500个时已经有很高的召回率（0.93），所以后续的实验都在首次召回500个文档的设置下进行。
+    \begin{table}[h]
+        \centering \caption{直接计算余弦相似度的召回率} \label{first retrieve num} 
+        \begin{tabular}{cc}
+        \toprule
+         Recall@100 & 0.75 \\
+         Recall@500 & 0.93 \\
+         Recall@5000 & 0.99 \\
+        \bottomrule
+        \end{tabular}
+    \end{table}
+
+    \subsubsection{二次召回方案选择}
+    从500个里召回3个，仍然是一个复杂的任务，尝试过几种方案：
+    \begin{enumerate}
+        \item 将其当做分类问题，输入一个查询，500个文章，输出这500个文章的概率。缺点是效率可能较低，而且实验后发现效果也不好；
+        \item 交叉编码(cross encoder)和point wise 对比学习\cite{jianshu}，给定一个查询和一个文章，模型给出一个得分，对于查询和对应的文章（正例），得分应该更高；对于查询和不对应的文章（负例），得分应该更低，这里的负例一般是从文章库中随机选取。实验后发现不管正负例的比例是多少，效果都不好；
+        \item 单塔编码，query通过查询编码器，fact通过文章编码器，之后计算编码后的向量的余弦相似度，如果不使用对比学习会导致模型把所有的文章都编码得非常相似，所以这里还是需要使用对比学习。那么如何选取负例？实验过还是从文章库中随机选取负例，效果还是不好，于是查找资料，发现了这样一篇文章：\cite{aistudio}，里面提到了In-batch Negatives策略，即将一个批次内其他样本都作为负例，尝试后发现效果有很大提升，从baseline的0.2121提升到了0.4059。
+    \end{enumerate}
+
+    最终选择了单塔编码的方案，并且使用了In-batch Negatives策略。
+    \subsection{数据预处理部分}
+    先使用 \mintinline{Python}{sklearn} 中的  \mintinline{Python}{train_test_split} 按8:2的比例分出训练集和验证集。对于训练集和验证集，先把首次召回500个的工作全部完成，即对于每一个查询，先召回500个文章，存放在内存中，训练时在这500 $\times $ num of queries 个样本里训练。
+    \subsection{模型构建}
+    这里需要做的就是在单塔编码中，如何编码查询，如何编码文章。其实非常简单，就是多层全连接层，使用ReLU作为激活函数，并且使用了残差连接，如图  \ref{model structure} 所示。
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=1\linewidth]{模型结构图.png}
+        \caption{模型结构图}\label{model structure}
+        （图中的N表示batch size，EMB表示嵌入维度。）
+    \end{figure}
+
+    由于一个批次内query和fact的每一行是一一对应的，所以将query和fact通过编码器后的向量两两计算余弦相似度，可以得到一个 $N \times N$ 的相似度矩阵：对角线位置表示查询和文档对应时的相似度（正例），非对角线位置表示不对应时的相似度（负例）。因此这个矩阵应该和单位矩阵相近，可以用交叉熵损失来训练。在实际代码中，使用了小于1的margin来代替对角线上的1，形成作为目标的对角阵。
+
+    这里只使用简单的线性层加残差连接，是因为尝试过很多种结构，比如更深的带残差块的线性层、Transformer（Decoder only）等，效果都不如简单的线性层。
+
+    \section{实验结果}
+    （以下召回率都是指Recall@3）
+
+    % 训练集和验证集比例为8:2，batch size设为1024，训练370轮，优化器为Adam，在验证集上的召回率为0.48165760869565216，提交后（在测试集上）的召回率为0.452148。
+
+    % 不区分训练集和验证集，batch size设为4096，训练1300轮，优化器为Adam，提交后（在测试集上）的召回率为0.529622。
+
+    % 不区分训练集和验证集，batch size设为2048，训练2500步(iter steps)（不是轮(epochs)了），优化器为Adam，提交后（在测试集上）的召回率为0.523275。
+
+    % 不区分训练集和验证集，batch size设为2048，训练2600步，优化器为AdamW，提交后（在测试集上）的召回率为0.523926。
+
+    % 不区分训练集和验证集，batch size设为2048，训练5100步，优化器为AdamW，将模型参数与输入输出的数据类型从float32改为float64，提交后（在测试集上）的召回率为0.584635。
+
+    结果如表  \ref{hyper-parameters} 所示。
+    \begin{table}[h]
+        \centering
+        \caption{尝试不同的超参数}\label{hyper-parameters}
+        \begin{tabular}{cccccccc}
+        \toprule
+         序号 & 是否有验证集 & batch size & epoch / step & 优化器 & 数据类型 & 验证集召回率 & 测试集召回率 \\
+         \midrule
+         0 & \multicolumn{4}{c}{baseline 无需训练} & float32 & / & 0.212077 \\
+         1 & 是 & 1024 & 370 epochs & Adam & float32 & 0.481658 & 0.452148 \\
+         2 & 否 & 4096 & 1300 epochs & Adam & float32 & / & 0.529622 \\
+         3 & 否 & 2048 & 2500 steps & Adam & float32 & / & 0.523275 \\
+         4 & 否 & 2048 & 2600 steps & AdamW & float32 & / & 0.523926 \\
+         5 & 否 & 2048 & 5100 steps & AdamW & float64 & / & 0.584635 \\
+         6 & 否 & 8192 & 9500 steps & AdamW & float64 & / & \textbf{0.606283} \\
+        \bottomrule
+        \end{tabular}
+    \end{table}
+
+    \bibliography{ref}
+
+\end{document}
--- a/深度学习/实验讲解/lab4.tex
+++ b/深度学习/实验讲解/lab4.tex
@@ -0,0 +1,441 @@
+% https://zhuanlan.zhihu.com/p/165140693
+% https://zhuanlan.zhihu.com/p/36868831
+
+%声明文档类型和比例
+\documentclass[aspectratio=169, 10pt, utf8, mathserif]{ctexbeamer}
+%调用相关的宏包
+% \usepackage{beamerfoils}
+
+\usepackage[outputdir=./latex-output]{minted}
+
+\usepackage{multicol}
+\setminted{breaklines=true, fontsize=\zihao{-6}}
+% \PassOptionsToPackage{fontsize=\zihao{-6}}{minted}
+
+\definecolor{shadecolor}{RGB}{204,232,207}
+
+\usetheme{Berlin} %主题包之一，直接换名字即可
+\setbeamertemplate{page number in head/foot}[totalframenumber]
+
+\usecolortheme{beaver} %主题色之一，直接换名字即可。
+\usefonttheme{professionalfonts}
+
+% 设置用acrobat打开就会全屏显示
+\hypersetup{pdfpagemode=FullScreen}
+
+% 设置logo
+% \pgfdeclareimage[height=2cm, width=2cm]{university-logo}{120701101}
+% \logo{\pgfuseimage{university-logo}}
+
+\parskip=1.2em
+
+%--------------正文开始---------------
+\begin{document}
+
+%每个章节都有小目录
+\AtBeginSubsection[]
+{
+ \begin{frame}<beamer>
+   \tableofcontents[currentsection,currentsubsection]
+ \end{frame}
+}
+
+\title{《深度学习》实验4讲解}
+\subtitle{多层感知机/全连接层}
+\author[岳锦鹏]{岳锦鹏 \\ \small 10213903403}
+
+\date{\today}
+\begin{frame}
+    %\maketitle
+    \titlepage
+\end{frame}
+
+\begin{frame}
+	\frametitle{目录}
+	\tableofcontents[hideallsubsections]
+\end{frame}
+
+\section{整体浏览}
+
+\begin{frame}[fragile]
+    首先逐个观察每个填空的部分需要完成哪些内容。
+
+    可以看到需要完成ReLU的反向传播过程。
+    \begin{minted}{python}
+    class Relu:
+        def __init__(self):
+            self.mem = {}
+
+        def forward(self, x):
+            self.mem['x'] = x
+            return np.where(x > 0, x, np.zeros_like(x))
+
+        def backward(self, grad_y):
+            '''
+            grad_y: same shape as x
+            '''
+
+            # ==========
+            # todo '''请完成激活函数的梯度后传'''
+            # ==========
+
+    \end{minted}
+\end{frame}
+
+\begin{frame}[fragile]
+    对于主要的模型部分，需要完成计算损失。
+    \begin{minted}{python}
+        def compute_loss(self, log_prob, labels):
+            '''
+            log_prob is the predicted probabilities
+            labels is the ground truth
+            Please return the loss
+            '''
+
+            # ==========
+            # todo '''请完成多分类问题的损失计算 损失为： 交叉熵损失 + L2正则项'''
+            # ==========
+
+
+    \end{minted}
+\end{frame}
+
+\begin{frame}[fragile]
+    按照给定的网络结构完成前向传播过程。
+    \begin{minted}{python}
+        def forward(self, x):
+            '''
+            x is the input features
+            Please return the predicted probabilities of x
+            '''
+
+            # ==========
+            # todo '''请搭建一个MLP前馈神经网络 补全它的前向传播 MLP结构为FFN --> RELU --> FFN --> Softmax'''
+            # ==========
+
+
+    \end{minted}
+\end{frame}
+
+\begin{frame}[fragile]
+    完成主模型的后向传播，注意这里可以使用其中各层的反向传播函数。
+    \begin{minted}{python}
+        def backward(self, label):
+            '''
+            label is the ground truth
+            Please compute the gradients of self.W1 and self.W2
+            '''
+
+            # ==========
+            # todo '''补全该前馈神经网络的后向传播算法'''
+            # ==========
+
+
+    \end{minted}
+\end{frame}
+
+\begin{frame}[fragile]
+    更新参数，这里要注意不要忘记正则项的梯度。
+    \begin{minted}{python}
+        def update(self):
+            '''
+            Please update self.W1 and self.W2
+            '''
+
+            # ==========
+            # todo '''更新该前馈神经网络的参数'''
+            # ==========
+
+    \end{minted}
+\end{frame}
+
+\section{逐个实现}
+\subsection{ReLU的反向传播}
+
+\begin{frame}[fragile]
+    \begin{multicols}{2}
+        首先看ReLU的反向传播，由于ReLU的公式为（符号和课件中保持一致所以用了$a$和$x$）
+        $$
+        a = \begin{cases}
+        x,\quad & x>0 \\
+        0,\quad & x\leqslant 0 \\
+        \end{cases}
+        $$
+        所以显然
+        $$
+        \frac{\mathrm{d}a}{\mathrm{d}x} = \begin{cases}
+        1,\quad & x>0 \\
+        0,\quad & x\leqslant 0 \\
+        \end{cases}
+        $$
+        \columnbreak
+        \begin{minted}{python}
+        class Relu:
+            def __init__(self):
+                self.mem = {}
+
+            def forward(self, x):
+                self.mem['x'] = x
+                return np.where(x > 0, x, np.zeros_like(x))
+
+            def backward(self, grad_y):
+                '''
+                grad_y: same shape as x
+                '''
+
+                # ==========
+                # todo '''请完成激活函数的梯度后传'''
+                # ==========
+
+        \end{minted}
+    \end{multicols}
+\end{frame}
+
+\begin{frame}[fragile]
+    
+    \begin{multicols}{2}
+        由于计算梯度时要根据输入$x$是否大于0来判断，所以这里使用了\mintinline{python}{self.mem}来记忆上次输入的$x$，在反向传播的时候就可以根据记忆的$x$来进行分支，这可以利用numpy的批量操作能力实现。\mintinline{python}{grad_y}是传入的梯度，返回的结果应为本层梯度与传入梯度的乘积：
+        $$
+        \mathit{return} = \frac{\mathrm{d}a}{\mathrm{d}x} \times \mathit{grad\_y}=\begin{cases}
+        \mathit{grad\_y},\quad & x>0 \\
+        0,\quad & x\leqslant 0 \\
+        \end{cases}
+        $$
+        因此写出代码如下：
+        \columnbreak
+        \begin{minted}{python}
+        class Relu:
+            def __init__(self):
+                self.mem = {}
+    
+            def forward(self, x):
+                self.mem['x'] = x
+                return np.where(x > 0, x, np.zeros_like(x))
+    
+            def backward(self, grad_y):
+                '''
+                grad_y: same shape as x
+                '''
+    
+                # ==========
+                # todo '''请完成激活函数的梯度后传'''
+                return np.where(self.mem['x'] > 0, grad_y, np.zeros_like(grad_y))
+                # ==========
+    
+        \end{minted}
+        \end{multicols}
+        \mint{python}|return np.where(self.mem['x'] > 0, grad_y, np.zeros_like(grad_y))|
+\end{frame}
+
+\subsection{交叉熵损失+L2正则项}
+\begin{frame}[fragile]
+    \begin{multicols}{2}
+        交叉熵损失的函数为
+        $$
+        loss=\sum_{\text{每个类别}i} -y_i \log(\hat{y}_i) 
+        $$
+        L2正则项的损失为
+        $
+        \lambda \left\Vert W \right\Vert 
+        $，$\lambda$为系数，$W$为权重，范数用的是欧几里得范数（对矩阵即Frobenius范数），即
+        $$\displaystyle \left\Vert W \right\Vert = \sqrt{\sum_{W\text{中的每个参数}x} x^{2} }$$
+
+        这里有两层网络，也就是两层权重，所以
+        $$
+        L2 = \lambda_1 \left\Vert W_1 \right\Vert +\lambda_2 \left\Vert W_2 \right\Vert 
+        $$
+        \columnbreak
+        \begin{minted}{python}
+            def compute_loss(self, log_prob, labels):
+                '''
+                log_prob is the predicted probabilities
+                labels is the ground truth
+                Please return the loss
+                '''
+
+                # ==========
+                # todo '''请完成多分类问题的损失计算 损失为： 交叉熵损失 + L2正则项'''
+                # ==========
+
+
+        \end{minted}
+    \end{multicols}
+\end{frame}
+
+\begin{frame}[fragile]
+    \begin{multicols}{2}
+        \mintinline{python}{log_prob}应该是希望传入已经经过$\log$计算的$\hat{y}$，但是在lab4.ipynb里发现其实是没有经过$\log$计算的\mintinline{python}{pred_y}，这里还得自己计算$\log(\hat{y})$，但是$\log (\hat{y}_i)$由于在前向传播的时候计算过就提前缓存在\mintinline{python}{self.log_value}了。
+
+        \mintinline{python}{labels}（即$y$）是one-hot编码的，它和\mintinline{python}{self.log_value}（即$\log(\hat{y})$）的形状都是[批大小，类别数]，根据公式要在类别数维度求和，所以是\mintinline{python}{axis=1}。注意还要在批大小维度求平均，即\mintinline{python}{.mean(0)}。
+
+        计算范数这里直接使用了\mintinline{python}{np.linalg.norm}。
+        \columnbreak
+        \begin{minted}{python}
+            def compute_loss(self, log_prob, labels):
+                '''
+                log_prob is the predicted probabilities
+                labels is the ground truth
+                Please return the loss
+                '''
+
+                # ==========
+                # todo '''请完成多分类问题的损失计算 损失为： 交叉熵损失 + L2正则项'''
+                return - np.sum(labels * self.log_value, axis=1).mean(0) + self.lambda1 * np.linalg.norm(self.W1) + self.lambda1 * np.linalg.norm(self.W2)
+                # ==========
+
+
+        \end{minted}
+    \end{multicols}
+\end{frame}
+
+\subsection{主模型的前向传播}
+
+\begin{frame}[fragile]
+    \begin{multicols}{2}
+        这里$x$的形状是[批大小，28，28]，这里的两个28分别是图像高度和宽度，而且可以观察到\mintinline{python}{self.W1}的形状是[100, 785]，但是$28\times 28=784$，说明需要把高度和宽度拉平后还需要拼接一个\mintinline{python}{np.ones}来替代偏置项的作用。即
+        \mint{python}|np.concatenate((x.reshape(x.shape[0], -1), np.ones((x.shape[0], 1))), axis=1)|
+
+        在\mintinline{python}{Matmul.backward}的注释中可以看到\\
+        \mintinline{python}{x: shape(d, N)}，所以拼接好之后还需要进行转置。
+        \columnbreak
+        \begin{minted}{python}
+            def forward(self, x):
+                '''
+                x is the input features
+                Please return the predicted probabilities of x
+                '''
+
+                # ==========
+                # todo '''请搭建一个MLP前馈神经网络 补全它的前向传播 MLP结构为FFN --> RELU --> FFN --> Softmax'''
+                # ==========
+
+
+        \end{minted}
+    \end{multicols}
+\end{frame}
+
+\begin{frame}[fragile]
+    \begin{multicols}{2}
+        在\mintinline{python}{Softmax.forward}的注释中可以看到\mintinline{python}{x: shape(N, c)}，因此在进行Softmax操作前还需要再转置回来。
+
+        理论上这时候就可以直接返回了，不需要用到\mintinline{python}{self.log}，$\log$是在计算交叉熵时才会用到的操作，但是在lab4.ipynb中非要先反向传播再计算损失，反向传播需要\mintinline{python}{self.log.backward}，但这又需要先调用过\mintinline{python}{self.log.forward}才能把输入记忆到\mintinline{python}{self.mem}中，才能正确返回梯度。
+        
+        那没办法，只能先调用一下\mintinline{python}{self.log.forward}把结果缓存起来。
+        \columnbreak
+        \begin{minted}{python}
+            def forward(self, x):
+                '''
+                x is the input features
+                Please return the predicted probabilities of x
+                '''
+
+                # ==========
+                # todo '''请搭建一个MLP前馈神经网络 补全它的前向传播 MLP结构为FFN --> RELU --> FFN --> Softmax'''
+                y = np.concatenate((x.reshape(x.shape[0], -1), np.ones((x.shape[0], 1))), axis=1).T  # 这形状真难弄
+                y = self.mul_h1.forward(self.W1, y)
+                y = self.relu.forward(y)
+                y = self.mul_h2.forward(self.W2, y).T
+                y = self.softmax.forward(y)
+                # print(y)
+                # 唉没办法，非要先反向传播再计算损失，那只能把log的结果缓存起来了
+                self.log_value = self.log.forward(y)
+                return y
+                # ==========
+
+
+        \end{minted}
+    \end{multicols}
+\end{frame}
+
+\subsection{主模型的反向传播}
+\begin{frame}[fragile]
+    \begin{multicols}{2}
+        前面的准备工作都实现了后，这里就很简单了，只需要逐层反向传播就行了。
+
+        注意交叉熵损失为 
+        $$
+        loss=\sum_{\text{每个类别}i} -y_i \log(\hat{y}_i) 
+        $$
+        所以
+        $$
+        \frac{\mathrm{d}loss}{\mathrm{d}\log(\hat{y}_i)}= -y_i 
+        $$
+        因此首个梯度为 \mintinline{python}{-label}，后续的反向传播就交给各层的\mintinline{python}{backward}函数了。
+        \columnbreak
+        \begin{minted}{python}
+            def backward(self, label):
+                '''
+                label is the ground truth
+                Please compute the gradients of self.W1 and self.W2
+                '''
+
+                # ==========
+                # todo '''补全该前馈神经网络的后向传播算法'''
+                # ==========
+
+
+        \end{minted}
+    \end{multicols}
+\end{frame}
+
+\begin{frame}[fragile]
+    \begin{multicols}{2}
+        仍然要注意在Softmax反向传播后需要转置一下。
+
+        \mintinline{python}{Matmul.backward}返回的结果为\mintinline{python}{return grad_x, grad_W}，这也提示了全连接层要保留对输入和对参数的求导，对输入的求导用来继续反向传播，对参数的求导用来更新参数。
+        \columnbreak
+        \begin{minted}{python}
+            def backward(self, label):
+                '''
+                label is the ground truth
+                Please compute the gradients of self.W1 and self.W2
+                '''
+
+                # ==========
+                # todo '''补全该前馈神经网络的后向传播算法'''
+                temp = self.log.backward(-label)
+                temp = self.softmax.backward(temp).T
+                temp, self.gradient2 = self.mul_h2.backward(temp)
+                temp = self.relu.backward(temp)
+                temp, self.gradient1 = self.mul_h1.backward(temp)
+                # ==========
+
+
+        \end{minted}
+    \end{multicols}
+\end{frame}
+
+\subsection{更新参数}
+\begin{frame}[fragile]
+    \begin{multicols}{2}
+        更新参数只需要按照公式即可，不要忘记L2正则项的梯度，以下以$W_1$为例，$W_2$同理。
+        
+        $W_1^{(i,j)}$表示$W_1$的第$i$行$j$列的元素，lr表示learning rate，即学习率。
+        $$
+        \frac{\mathrm{d}L2}{\mathrm{d}W_1^{(i,j)}}= \frac{\lambda_1 W_1^{(i,j)}}{\left\Vert W_1 \right\Vert }
+        $$
+
+        $$
+        W_1 = W_1 - \left( \frac{\mathrm{d}loss}{\mathrm{d}W_1}+\frac{\mathrm{d}L2}{\mathrm{d}W_1} \right) \times lr
+        $$
+        \columnbreak
+        \begin{minted}{python}
+            def update(self):
+                '''
+                Please update self.W1 and self.W2
+                '''
+
+                # ==========
+                # todo '''更新该前馈神经网络的参数'''
+                self.W1 -= (self.gradient1 + self.lambda1 * self.W1 / np.linalg.norm(self.W1)) * self.lr
+                self.W2 -= (self.gradient2 + self.lambda1 * self.W2 / np.linalg.norm(self.W2)) * self.lr
+                # ==========
+
+        \end{minted}
+    \end{multicols}
+\end{frame}
+
+\begin{frame}
+	\zihao{-4}\centering{感谢观看！}
+\end{frame}
+\end{document}