%%%%%%%%%%%% Subject: template for lecture notes %%%%%%%%%%%%%%%%%%%%
%
% Thank you for volunteering to write up notes on today's lecture.
%
% Here is the template file for writing up notes for this course.
% If you scribe on a Tuesday we ask that you please bring your
% notes by the following Tuesday.
%
% To help you with your notes, I can let you copy the notes used in
% giving the lecture. Please ask
% for help in understanding the material or if you have a question
% about how it should be presented.
%
% In order to make the notes more uniform and easier to read, we are providing
% several macros for theorems and such. (Some examples of their use are
% included below.) Also, we ask that you follow a few stylistic conventions:
%
% Please write your notes as if you were explaining the material to another
% student, rather than as minutes of a meeting. (That is, don't write
% things like ``Today we started discussing...'' or ``next time,
% we will...'' or ``then, someone asked the question...''
% Just explain the material as clearly as you can.
%
% If you need to refer to an earlier lecture, use the lecture number.
% For example, ``In Lecture 6 it was shown that...''
% To refer to a later lecture for which you don't know the number,
% just say something like,
% ``This impossibility result will be discussed further in a later
% lecture.'' (At the end of the course, I will go back and fill in the
% appropriate lecture numbers.)
%
% If material from a handout is covered in lecture, then we would like to
% include the text of the handout as a part of the notes to make them more
% self-contained. To make your job easier, I will mail you the latex
% files for the relevant handouts, and you can merge them in at the
% appropriate places. Also, you may want to annotate the handout as you
% see fit. If you happen to take notes on a day that a homework is assigned,
% I will also mail you the exercises, which you can put at the end of your
% notes.
%
% Figures should be done in Latex, if at all possible. You may have available
% on your system a program called ``texdraw.'' The program lets you draw
% figures with the mouse and then generates a latex file, which you can then
% insert into your text. If you don't have this program, usually writing the
% latex commands yourself isn't that hard. For very complicated figures, you
% may wish to leave the appropriate space in the text and draw the figures
% neatly (each on a separate page) in black ink. For the first draft you
% hand in, you may not want to invest a lot of time drawing the figures very
% neatly, because there's a chance that we will want you to modify
% them. Save that effort for the draft to be handed out.
%
% Please let me know if you have any questions.
%
%
% NOTE(review): \documentstyle is LaTeX 2.09 syntax; under LaTeX2e this file
% runs in 2.09 compatibility mode.  The modern form would be
% \documentclass[12pt]{book} -- left unchanged here for compatibility.
\documentstyle[12pt]{book}
% Page-layout tuning: 6.5in x 8in text block, top margin pulled up by 1in
% so the framed lecture header fits on the page.
%\setlength{\oddsidemargin}{-.1 in}
%\setlength{\evensidemargin}{-.3 in}
\setlength{\evensidemargin}{.0 in}
\addtolength{\topmargin}{-1in}
\setlength{\textwidth}{6.5in}
\setlength{\textheight}{8in}
%
% \remove{...} excludes a whole span of text from the printout by
% discarding its argument.  Usage:
%   \remove{
%     ...text to be excluded...
%   }
% NOTE(review): the excluded text is still read as a macro argument, so
% it must be brace-balanced and cannot contain \verb.
\newcommand{\remove}[1]{}
%
% The following macros are used to generate nice code for programs.
% See example on how to use it below
%
%%%%%%%%%%%%%%%%%%%%% program macros %%%%%%%%%%%%%%%%%
% Pseudo-code keyword macros, for use inside the `program' environment
% below.  The \=, \+, \-, \<, \> commands are LaTeX tabbing primitives:
% \= sets a tab stop, \+ / \- push / pop one indentation level, and
% \< / \> move among existing stops.  So, e.g., \If opens an indentation
% level that the matching \Endif closes (two levels, via \-\-, because
% \Then opens a second one).
\newcommand{\Do}{{\small\bf do}\ }
\newcommand{\Return}{{\small\bf return\ }}
\newcommand{\Proc}[1]{#1\+}
\newcommand{\Returns}{{\small\bf returns}}
\newcommand{\Procbegin}{{\small\bf begin}}
\newcommand{\If}{{\small\bf if}\ \=\+}
\newcommand{\Then}{{\small\bf then}\ \=\+}
\newcommand{\Else}{\<{\small\bf else}\ \>}
\newcommand{\Elseif}{\<{\small\bf elseif}\ }
\newcommand{\Endif}{\<{\small\bf end\ if\ }\-\-}
\newcommand{\Endproc}[1]{{\small\bf end} #1\-}
\newcommand{\For}{{\small\bf for}\ \=\+}
\newcommand{\Endfor}{{\small\bf end\ for}\ \-}
\newcommand{\Loop}{{\bf loop}\ \= \+}
\newcommand{\Endloop}{{\bf end\ loop}\ \-}
% The `program' environment wraps a tabbing environment in a minipage so
% a code fragment is kept together on one page; the "\ \ \ \ \=\kill"
% line establishes four spaces as the basic indentation unit.
\newenvironment{program}{
\begin{minipage}{\textwidth}
\begin{tabbing}
\ \ \ \ \=\kill
}{
\end{tabbing}
\end{minipage}
}
% \blankline forces an empty output line via an empty \hbox.
\def\blankline{\hbox{}}
%%%%%%%%%%%%%%%%%%%%% End of PROGRAM macros %%%%%%%%%%%%%%%%%
%
% The following macro is used to generate the header.
%
%
% \lecture{NUM}{1ST-PAGE}{DATE}{LECTURER}{SCRIBE} typesets the framed
% header box for a lecture and initializes counters and running heads.
%   #1 = lecture number (also becomes the chapter counter, so theorem
%        numbers read, e.g., 10.1), #2 = first page number, #3 = date,
%   #4 = lecturer name, #5 = scribe name(s).
\newcommand{\lecture}[5]{
\pagestyle{headings}
\thispagestyle{plain}
\newpage
% the chapter counter tracks the lecture number for numbering purposes
\setcounter{chapter}{#1}
\setcounter{page}{#2}
% \set\thechapter{#3}
\noindent
\begin{center}
% the framed box: course banner, lecture title line, lecturer/scribe line
\framebox{
\vbox{
\hbox to 6.28in { {\bf Computational Learning Theory
\hfill Fall Semester, 1992/3} }
\vspace{4mm}
\hbox to 6.28in { {\Large \hfill Lecture #1: #3 \hfill} }
\vspace{2mm}
\hbox to 6.28in { {\it Lecturer: #4 \hfill Scribe: #5} }
}
}
\end{center}
% both even- and odd-page running heads show the lecture title
\markboth{Lecture #1: #3}{Lecture #1: #3}
\vspace*{4mm}
}
%
% Use these macros for organizing sections of your notes.
% Each command takes two arguments: (1) the title of the section and and
% (2) a keyword for that section to appear in the index. (See examples.)
% Please don't use \section, \subsection, and \subsubsection directly!
%
% Each takes (1) a section title and (2) an index keyword; \topic also
% sets the right-hand running head to the title.
\newcommand{\topic}[2]{\section{#1} \index{#2} \markright{#1}}
\newcommand{\subtopic}[2]{\subsection{#1} \index{#2}}
\newcommand{\subsubtopic}[2]{\subsubsection{#1} \index{#2}}
%
% Convention for citations is first author's last name followed by other
% authors' last initials, followed by the year. For example, to cite the
% seventh entry in the course bibliography, you would type: \cite{BurnsL80}
% (To avoid bibliography problems, for now we redefine the \cite command.)
%
\renewcommand{\cite}[1]{[#1]}
%
% These are just to make things a little easier:
%
% List shorthands: \bi...\ei = itemize, \be...\ee = enumerate.
\newcommand{\bi}{\begin{itemize}}
\newcommand{\ei}{\end{itemize}}
\newcommand{\be}{\begin{enumerate}}
\newcommand{\ee}{\end{enumerate}}
\newcommand{\blank}{\vspace{1ex}} % generates a blank line in the output
%
% Use these for theorems, lemmas, proofs, etc.
%
% Theorem-like environments, numbered within the chapter (= lecture);
% lemma/claim/corollary share the theorem counter.
\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}
% End-of-proof marker.  NOTE(review): $\Box$ is a latexsym/lasy symbol --
% confirm it is available under this style.
\newcommand{\qed}{\hfill $\Box$}
% proof environment: bold ``Proof:'' header, automatic \qed at the end
\newenvironment{proof}{\par{\bf Proof:}}{\qed \par}
%\newenvironment{proof}{{\em Proof:}}{\hfill\rule{2mm}{2mm}}
%
% Use the following for definitions.
% \bigdef is for definitions to be set off by themselves; \smalldef is for
% definitions given in the middle of a paragraph.
%
% `dfn' sets a definition off by itself with a bold "Definition" header.
\newenvironment{dfn}{{\vspace*{1ex} \noindent \bf Definition }}{\vspace*{1ex}}
% \bigdef{index-key}{text}: a displayed (set-off) definition, also indexed.
\newcommand{\bigdef}[2]{\index{#1}\begin{dfn} {\rm #2} \end{dfn}}
% \smalldef{term}: an inline, emphasized and indexed defined term.
\newcommand{\smalldef}[1]{\index{#1} {\em #1}}
% **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE:
\begin{document}
%\lecture{**LECTURE-NUMBER**}{**1ST-PAGE**}{**DATE**}{**LECTURER**}{**SCRIBE**}
\lecture{10}{1}{December 30}{Yishay Mansour}{Libi Hertzberg, Uri
Schneider}
% **** YOUR NOTES GO HERE
\topic{TD-Gammon}{} \subtopic{MDPs with a very large number of
states}{} What happens when the number of states is too large to
enumerate?
For example: the game of backgammon.\\ Board description: there
are 24 positions on the board.\\ Pieces: each player has 15
pieces. \\The goal: to bring your pieces to your side of the
board, and then remove them.\\ Game play: In each turn a player
throws two dice, and moves his pieces accordingly.\\ Interaction:
\begin{enumerate}
\item If a white piece is the only one in its position, and a black piece reaches that position,
the white piece is removed, and needs to start from the beginning.
\item If two or more white pieces are in the same position, then a black piece cannot ``land'' there.
\end{enumerate}
End of game: The one who removed all his pieces out of the board
first wins the game.
Let's try to estimate the number of states in backgammon.
Description of a state: for each of the $24 + 2$ positions (the 24
board points, plus the bar and the borne-off area), remember the
number of pieces of each color located there. Number of states for
one player's pieces: $\cong 2.5 \times 10^{10}$.\\
Total number of states: $\cong 10^{20}$.
\subtopic{State encoding TD-Gammon}{} TD-Gammon uses a neural
network with 198 inputs. For each position and for each color
there are four inputs:
\begin{enumerate}
\item equals true if there is at least one piece present.
\item equals true if there are at least two pieces present.
\item equals true if there are at least three pieces present.
\item has a value of $\frac {n - 3}{2}$ if there are at least four pieces present.
\end{enumerate}
If no piece is present then all four inputs are false.\\ Two
additional inputs encode the number of pieces that were ``taken''
for each color. Each one has a value of $\frac{n}{2}$ where $n$ is
the number of taken pieces. Two other inputs encode the number of
pieces removed. Each one has a value of $\frac{n}{15}$ where $n$
is the number of pieces removed. The last two boolean inputs encode
for each player whether it is his turn currently. \subtopic{MDP
description}{}
\begin{enumerate}
\item The discount factor is set to $\lambda = 1$.
\item Immediate rewards:
\begin{enumerate}
\item In non-terminal states the immediate rewards are equal to 0.
\item In a winning terminal state the immediate reward equals 1.
\item In a losing terminal state the immediate reward equals 0.
\end{enumerate}
\end{enumerate}
I.e. the TD has a difference of $V(s_{t+1}) - V(s_{t})$ for all
non-terminal states.
In every move the parameters are changed in the direction of the
TD. Generally, assume we have a function $F(s,r) = V_{r}(s)$
,which gives each state $s$ a value according to $r$ ($r$ is
actually a program which gets a state $s$ as an input). We will
update $\overrightarrow{r}$ by the derivative of $V_{r}(s)$
according to $\overrightarrow{r}$. I.e. according to the vector
$[\frac{\partial}{\partial r_{1}}V_{r}(s),\ldots
,\frac{\partial}{\partial r_{k}}V_{r}(s)]$. Updating
$\overrightarrow{r}$ in this direction will hopefully change the
value of $V_{r}(s)$ in the ``right'' direction. TD tries to minimize
the difference between two succeeding states: assuming that
$V_{r}(s_{t+1}) > V_{r}(s_{t})$, we would like to ``strengthen'' the
weight of the action taken, and update in the direction of
$[V_{r}(s_{t+1}) - V_{r}(s_{t})]\nabla_{\overrightarrow{r}}
V_{r}(s_{t})$.
For example, if $r$ is a table: $\overrightarrow{r} =
(r_{1},\ldots, r_{k})$, then $\nabla_{\overrightarrow{r}} V_{r}(s)
= (0, 0,\ldots, 1, 0,\ldots, 0)$, and the update will occur only
in the $s$'th entry of $r$.
TD Gammon updates $\overrightarrow{r}$ while running the system,
where the current policy is the greedy policy with respect to the
function $V_{r}(s)$.
Specifically, $\overrightarrow{r_{t+1}} = \overrightarrow{r_{t}} +
\alpha[V_{t}(s_{t+1}) - V_{t}(s_{t})]\cdot\overrightarrow{e_{t}}$,
where $\overrightarrow{e_{t}} =
\gamma\lambda\overrightarrow{e_{t-1}} +
\nabla_{\overrightarrow{r_{t}}}V_{t}(s_{t})$
TD-Gammon simply ``learns'' by playing against itself, i.e., it updates $\overrightarrow{r_{t}}$ during self-play. %%@
After about $300,000$ games the system achieved a very good
playing skill (equivalent to other backgammon playing programs).
\subtopic{"Encoding" Functions $F:S\rightarrow\Re$}{}
One way to implement $F$ is by using a table. This enables us to implement {\em any} function $F$. The drawback in this %%@
approach is that the space needed is proportional to the number of states. Since, in backgammon, the number of states is %%@
very large ($\cong10^{20}$), using a table is impractical.
We will implement $F$ in a way which does {\em not} enable every
mapping $S\rightarrow\Re$. First, we choose a mapping
$H:S\rightarrow U$, which maps each state $s$ to a vector
$\overrightarrow{u}$ that will ``represent'' it. Choosing an
appropriate mapping $H$ determines the performance of the
algorithm. For example:
\begin{enumerate}
\item TD-Gammon - $\overrightarrow{u}$ is a vector with 198 slots.
\item 21 (blackjack) - $u$ is the sum of cards.
\end {enumerate}
From here on, we exchange $s$ with $\overrightarrow{u}=H(s)$, and
perform the calculations on the vector $\overrightarrow{u}$.
In the game of 21 there were few such vectors
$\overrightarrow{u}$, and we could build a table for all of them.
It is not the case for backgammon.
\subsubtopic{methods of encoding}{}
\be
\item {\em Linear Function}
We choose a weight function, $\overrightarrow{w}$, for which the
value function is
$V_{w}(s)=\overrightarrow{w}\cdot\overrightarrow{H(s)}=\sum
w_{i}u_{i}$. Naturally, we cannot calculate {\em every} function
this way, but in many cases it gives good results. Another
advantage is the simplicity of calculating the derivative
$\nabla_{w}V_{w}(s)=\nabla_{w}\overrightarrow{w}\cdot\overrightarrow{u}=\overrightarrow{u}$.
The derivative is the encoding of the state, which is very
convenient computation-wise.
\item {\em Neural Networks} (figure \ref{L10:NN1})
\ee
\begin{figure}
\begin{picture}(400,200)(0,100) % begins picture environment
\put(200,262.5){\circle{25}} \put(200,262.5){\makebox(0,0)[c]{s}}
\put(100,187.5){\circle{25}} \put(100,187.5){\makebox(0,0)[c]{s}}
\put(200,187.5){\circle{25}} \put(200,187.5){\makebox(0,0)[c]{s}}
\put(300,187.5){\circle{25}} \put(300,187.5){\makebox(0,0)[c]{s}}
\put(200,250){\line(-2,-1){100}} \put(200,250){\line(0,-1){50}}
\put(200,250){\line(2,-1){100}}
\put(100,175){\line(-1,-1){30}} \put(200,175){\line(-1,-1){30}}
\put(300,175){\line(-1,-1){30}}
\put(100,175){\line(0,-1){30}} \put(200,175){\line(0,-1){30}}
\put(300,175){\line(0,-1){30}}
\put(100,175){\line(1,-1){30}} \put(200,175){\line(1,-1){30}}
\put(300,175){\line(1,-1){30}}
\put(30,187.5){\makebox(0,0)[c]{sigmoid$\rightarrow$}}
\put(30,145){\makebox(0,0)[c]{inputs$\rightarrow$}}
\end{picture}
\caption{A Neural Network}
\label{L10:NN1}
\end{figure}
\begin{figure}
\begin{picture}(400,200)(0,100) % begins picture environment
\put(200,262.5){\circle{25}} \put(200,262.5){\makebox(0,0)[c]{s}}
\put(200,250){\line(-2,-1){100}} \put(200,250){\line(0,-1){50}}
\put(200,250){\line(2,-1){100}}
\put(100,187.5){\makebox(0,0)[c]{$z_{1}$}}
\put(200,187.5){\makebox(0,0)[c]{$z_{2}$}}
\put(300,187.5){\makebox(0,0)[c]{$z_{3}$}}
\put(120,220){\makebox(0,0)[c]{$w_{1}$}}
\put(188,220){\makebox(0,0)[c]{$w_{2}$}}
\put(280,220){\makebox(0,0)[c]{$w_{3}$}}
\end{picture}
\caption{Calculating The Value Of A Gate}
\label{L10:NN2}
\end{figure}
Calculation of a gate (see figure \ref{L10:NN2}): \\ First, we
compute $\alpha=\sum w_{i}z_{i}$. Then give $\alpha$ as an
argument to a non linear function.
\be
\item perceptron: $h(\alpha) = \left\{ \begin{array}{ll} 1, & \alpha > 0 \\
0, & \alpha \leq 0 \end{array} \right.$ \\ The problem: it isn't a continuous function.
\item sigmoid function: a continuous perceptron approximation,
\[\sigma(\alpha)=\frac{1}{1+e^{-\alpha}}\ .\]
Note that, $\sigma(0)=\frac{1}{2}$, and when $\alpha \rightarrow
\infty$, $\sigma(\alpha)\rightarrow1$, and when $\alpha
\rightarrow -\infty$, $\sigma(\alpha)\rightarrow0$. \\\\ It is
possible to connect a large number of such gates, each gate has
its own weight vector $w_{i}$. There are simple algorithms for
computing the derivative by using the chain rule. \ee
Basically, we are left with a learning problem: finding $F(r,s)$
that corresponds to $V^{*}$. We are interested in two things:
\bi
\item $V^{*} = \max_{\pi}\{V^{\pi}\}$, equivalent to $\max_{r}\{F(r,s)\}$
\item $V^{*} \cong F(r^{*},s)$
\ei
\subtopic{Choosing the parameters for $F(r,s)$}{} If it is
possible, we would like to find a vector $r$ such that:
\[\forall s:\ F(r,s) = V^{\pi}(s)\]
In most cases, it isn't possible since there aren't sufficient
computing resources. Therefore, we should discuss the distance
between $F$ and $V^{\pi}$. One way to measure this distance is to
calculate the minimum square error (MSE):
\[\min_{r}\ E_{s}[(F(r,s)-V^{\pi}(s))^{2}]\]
If it is possible to encode $V^{\pi}$ using $F(r,s)$, this minimum
equals to 0. Otherwise, we try to get as close as possible to
$V^{\pi}(s)$. For linear functions it is sometimes possible to
compute $\overrightarrow{r}$ which minimizes the MSE, but
generally, it is a difficult task.
Let's look at the problem in the following manner: we try to
minimize
\[G(r,s)=E_{s}[(F(r,s)-V^{\pi}(s))^{2}]\ .\]
Finding the global minimum cannot always be done efficiently, and
it is much easier to find a local minimum (and hope it is good
enough). To find a local minimum, we can follow the derivative of
$G(r,s)$. As long as the derivative is non zero, we have a
direction, in which $G(r,s)$ is descending. When the value of the
derivative equals to zero, we are done. We now only need to
establish that this is a minimum (and not a maximum or an
inflection point). The reason that this is not a maximum value is
that the function descends to this point. However, we could be at
an inflection point, and remain there (because the derivative
could be fixed on zero in that neighborhood).
\[\overrightarrow{r_{t+1}} =
\overrightarrow{r_{t}}-\frac{\alpha}{2}\nabla_{\overrightarrow{r_{t}}}[V^{\pi}(s_{t})-V_{t}(s_{t})]^{2}
= \overrightarrow{r_{t}} +
\alpha[V^{\pi}(s_{t})-V_{t}(s_{t})]\cdot\nabla_{\overrightarrow{r_{t}}}V_{t}(s_{t})\]
Recall that:
\[\nabla_{\overrightarrow{r_{t}}}V_{t}(s_{t}) =
(\frac{\partial}{\partial
r_{1}}V_{t}(s_{t}),\ldots,\frac{\partial}{\partial
r_{k}}V_{t}(s_{t}))\ .\] Of course, we don't have access to
$V^{\pi}(s_{t})$, but rather to a sample of it $v_{t}$ (which
could be noisy). Thus, we have:
\[\overrightarrow{r_{t+1}} = \overrightarrow{r_{t}} +
\alpha[v_{t}-V_{t}(s_{t})]\cdot\nabla_{\overrightarrow{r_{t}}}V_{t}(s_{t})\]
For example, $v_{t}$ could be a Monte-Carlo estimate, i.e. the
total reward from time $t$ onwards $R_{t}$.
In the case of backgammon, $v_{t}$ denotes whether we won or lost
the game. Similarly,
\[\overrightarrow{r_{t+1}} =
\overrightarrow{r_{t}} +
\alpha[R^{\lambda}_{t}-V_{t}(s_{t})]\cdot\nabla_{\overrightarrow{r_{t}}}V_{t}(s_{t})\]
\topic{TD-Gammon}{} Let $V^{*}(s,0)$ be the probability of white
winning from state $s$ and it is white's turn (assuming white and
black are playing optimally). Let $V^{*}(s,1)$ be the probability
of white winning from state $s$ and it is black's turn.
We estimate $V^{*}(s,l)$ using a neural network which calculates
$\mathaccent'176{V}(s,l,r)$. \\
{\em The Neural Network:}
\bi
\item 198 inputs.
\item 40 nodes in the second level.
\item 1 output node in the third level. \\
\ei
{\em Network Initialization:} (small) random weights. \\
{\em Training Method:} The program plays for both sides. For each
point in time we have a state $s_{t}$, a vector $r_{t}$, and a
turn $l_{t}$.
For each state $s'$ accessible from $s_{t}$ (according to the
dice), we calculate $\mathaccent'176{V}(s',l_{t},r_{t})$, and
choose the best state. (For white's turn choose the state
corresponding to the maximum value; for black's - the minimum.) \\
{\em Updating Parameters:} At the end of each turn, we compute:
\begin{displaymath}
d_{t}=\mathaccent'176{V}(s_{t+1},l_{t+1},r_{t}) -
\mathaccent'176{V}(s_{t},l_{t},r_{t})\ ,
\end{displaymath}
which is the TD (temporal difference). (In the final state we
replace $\mathaccent'176{V}$ with the game outcome.) In addition,
we update $\overrightarrow{r_{t}}$:
\begin{displaymath}
\overrightarrow{r_{t+1}}\leftarrow
\overrightarrow{r_{t}}+\alpha\cdot
d_{t}\underbrace{\sum_{k=0}^{t} \gamma^{t-k} \nabla_{r_{k}}
\mathaccent'176{V}(s_{k},l_{k},r_{k})}_{\overrightarrow{e_{t}}}\ .
\end{displaymath}
At the end of each game a new game is started, and $r_{0}$ is set
to the previous game's parameter vector.
\be
\item $\alpha$ is set to a constant (determined by experiments).
\item $\gamma$ does not affect the results significantly in this case.
\ee
At the end of the training phase, we get a function
$\mathaccent'176{V}(s,l,r)$ ($r$ is fixed) which we can use to
play backgammon. \\
{\em Improvements:}
\bi
\item Instead of 40 nodes at the second level, we add 40 more (80
total), the additional 40 units are set to represent important
patterns for backgammon.
\item After $\mathaccent'176{V}$ is set, it can be used
immediately (one step), or, alternatively, we can look a few steps
ahead, by building a game search tree. \ei
\newpage
{\em Comments:}
\bi
\item $\mathaccent'176{V}$ wasn't a good estimate for the
probability of white winning, but the policy derived from it is a
very good one. (The reason for this is as yet unclear.)
\item The chosen policy is totally greedy (selected
deterministically). However, in backgammon there is a lot of
randomness, because of the dice. This enables us to ``explore'',
although we do not perform this explicitly.
\item TD-Gammon has achieved a skill level close to the level of
the best players in the world today! \ei
\topic{Calculating the Derivative of a Neural Network}{}
\begin{figure}
\begin{picture}(400,200)(0,100) % begins picture environment
\put(200,262.5){\circle{25}} \put(200,262.5){\makebox(0,0)[c]{s}}
\put(200,275){\vector(0,1){20}}
\put(205,285){\makebox(0,0)[c]{$z$}}
\put(100,187.5){\circle{25}} \put(100,187.5){\makebox(0,0)[c]{s}}
\put(200,187.5){\circle{25}} \put(200,187.5){\makebox(0,0)[c]{s}}
\put(300,187.5){\circle{25}} \put(300,187.5){\makebox(0,0)[c]{s}}
\put(200,250){\line(-2,-1){100}} \put(200,250){\line(0,-1){50}}
\put(200,250){\line(2,-1){100}}
\put(120,220){\makebox(0,0)[c]{$w_{1}$}}
\put(190,220){\makebox(0,0)[c]{$w_{2}$}}
\put(280,220){\makebox(0,0)[c]{$w_{3}$}}
\put(91,203){\makebox(0,0)[c]{$y_{1}$}}
\put(193,203){\makebox(0,0)[c]{$y_{2}$}}
\put(309,203){\makebox(0,0)[c]{$y_{3}$}}
\put(100,175){\line(-1,-1){30}} \put(100,175){\line(0,-1){30}}
\put(100,175){\line(1,-1){30}}
\put(78,160){\makebox(0,0)[c]{$u_{1}$}}
\put(107,160){\makebox(0,0)[c]{$u_{2}$}}
\put(123,160){\makebox(0,0)[c]{$u_{3}$}}
\put(100,130){\makebox(0,0)[c]{$\overrightarrow{x}$}}
\put(30,187.5){\makebox(0,0)[c]{sigmoid$\rightarrow$}}
\end{picture}
\caption{Calculating the Derivative of a Neural Network}
\label{L10:NN3}
\end{figure}
Let $z$ be the output of the neural network. \\ Let
$w_{1},\ldots,w_{l}$ be the weights of the inputs from the second
level to the output gate. \\ Let $y_{1},\ldots,y_{l}$ be the
outputs of the gates in the second level. \\ Let
$u_{i1},\ldots,u_{ik_{i}}$ be the weights of the inputs to gate
$y_{i}$. \\ Let $x_{1},\ldots,x_{k}$ be the inputs to the neural
network. \\ (see figure \ref{L10:NN3})
\begin{displaymath}
z = F(\overrightarrow{x})=\sigma(\sum_{i} w_{i}
\sigma(\overrightarrow{u_{i}}\cdot\overrightarrow{x}))
\end{displaymath}
\begin{displaymath}
\frac{\partial z}{\partial u_{ij}} = \frac{\partial z}{\partial
y_{i}} \cdot \frac{\partial y_{i}}{\partial u_{ij}}
\end{displaymath}
In this case:
\begin{displaymath}
\sigma(x) = \frac{1}{1+e^{-x}}
\end{displaymath}
\begin{displaymath}
\frac{\partial}{\partial x}\sigma(x) =
\frac{e^{-x}}{(1+e^{-x})^{2}} = e^{-x}\cdot\sigma^{2}(x)
\end{displaymath}
\begin{displaymath}
\frac{\partial}{\partial y_{i}}\sigma(\Sigma y_{j}w_{j}) =
w_{i}\cdot e^{-\Sigma y_{j}w_{j}}\cdot\sigma^{2}(\Sigma y_{j}w_{j})
\end{displaymath}
\end{document}