% master_thesis.bib

@article{Bogovic2013,
abstract = {For image recognition and labeling tasks, recent results suggest that machine learning methods that rely on manually specified feature representations may be outperformed by methods that automatically derive feature representations based on the data. Yet for problems that involve analysis of 3d objects, such as mesh segmentation, shape retrieval, or neuron fragment agglomeration, there remains a strong reliance on hand-designed feature descriptors. In this paper, we evaluate a large set of hand-designed 3d feature descriptors alongside features learned from the raw data using both end-to-end and unsupervised learning techniques, in the context of agglomeration of 3d neuron fragments. By combining unsupervised learning techniques with a novel dynamic pooling scheme, we show how pure learning-based methods are for the first time competitive with hand-designed 3d shape descriptors. We investigate data augmentation strategies for dramatically increasing the size of the training set, and show how combining both learned and hand-designed features leads to the highest accuracy.},
archiveprefix = {arXiv},
arxivid = {1312.6159},
author = {Bogovic, John A. and Huang, Gary B. and Jain, Viren},
eprint = {1312.6159},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Bogovic, Huang, Jain - 2013 - Learned versus Hand-Designed Feature Representations for 3d Agglomeration.pdf:pdf},
journal = {arXiv preprint arXiv:1312.6159},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
pages = {1--14},
title = {Learned versus Hand-Designed Feature Representations for 3d Agglomeration},
url = {http://arxiv.org/abs/1312.6159},
year = {2013}
}

@inproceedings{Chai1998,
abstract = {This paper addresses our proposed method to automatically locate the person's face from a given image that consists of a head-and-shoulders view of the person and a complex background scene. The method involves a fast, simple and yet robust algorithm that exploits the spatial distribution characteristics of human skin color. It first uses the chrominance component of the input image to detect pixels with skin color appearance. Then, bused on the spatial distribution of the detected skin-color pixels and their corresponding luminance values, the algorithm employs some regularization processes to reinforce regions of skin-color pixels that are more likely to belong to the facial regions and eliminate those that are not. The performance of the face localization algorithm is illustrated by some simulation results carried out on various head-and-shoulders test images},
author = {Chai, D. and Ngan, K. N.},
booktitle = {Proceedings of the Third IEEE International Conference on Automatic Face and Gesture Recognition},
doi = {10.1109/AFGR.1998.670936},
keywords = {Australia,Chromium,Color,Digital simulation,Face detection,Humans,Layout,Skin,Visual communication,chrominance component,data mining,face recognition,facial region location,head-and-shoulders color image,image segmentation,mscthesis,robust algorithm,simulation results,skin-color pixels,spatial distribution characteristics},
mendeley-tags = {Australia,Chromium,Color,Digital simulation,Face detection,Humans,Layout,Skin,Visual communication,chrominance component,data mining,face recognition,facial region location,head-and-shoulders color image,image segmentation,mscthesis,robust algorithm,simulation results,skin-color pixels,spatial distribution characteristics},
month = apr,
pages = {124--129},
publisher = {IEEE},
title = {Locating facial region of a head-and-shoulders color image},
url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=670936},
year = {1998}
}

@inproceedings{Csurka2004,
author = {Csurka, Gabriella and Dance, Christopher R. and Fan, Lixin and Willamowski, Jutta and Bray, C{\'e}dric},
booktitle = {Workshop on Statistical Learning in Computer Vision, ECCV},
file = {:share/imagedb/perellm1/references/Csurka, Dance\_2004\_Visual categorization with bags of keypoints.pdf:pdf},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
title = {Visual categorization with bags of keypoints},
year = {2004}
}

@article{King-Smith1976,
abstract = {We show how the processes of visual detection and of temporal and spatial summation may be analyzed in terms of parallel luminance (achromatic) and opponent-color systems; a test flash is detected if it exceeds the threshold of either system. The spectral sensitivity of the luminance system may be determined by a flicker method, and has a single broad peak near 555 nm; the spectral sensitivity of the opponent-color system corresponds to the color recognition threshold, and has three peaks at about 440, 530, and 600 nm (on a white background). The temporal and spatial integration of the opponent-color system are generally greater than for the luminance system; further, a white background selectively depresses the sensitivity of the luminance system relative to the opponent-color system. Thus relatively large (1$^\circ$) and long (200 msec) spectral test flashes on a white background are detected by the opponent-color system except near 570 nm; the contribution of the luminance system becomes more prominent if the size or duration of the test flash is reduced, or if the white background is extinguished. The present analysis is discussed in relation to Stiles' model of independent $\pi$ mechanisms.},
author = {King-Smith, P. E. and Carden, D.},
doi = {10.1364/JOSA.66.000709},
journal = {Journal of the Optical Society of America},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
month = jul,
number = {7},
pages = {709--717},
title = {Luminance and opponent-color contributions to visual detection and adaptation and to temporal and spatial integration},
url = {http://www.opticsinfobase.org/abstract.cfm?URI=josa-66-7-709},
volume = {66},
year = {1976}
}

@inproceedings{Le2011a,
abstract = {Previous work on action recognition has focused on adapting hand-designed local features, such as SIFT or HOG, from static images to the video domain. In this paper, we propose using unsupervised feature learning as a way to learn features directly from video data. More specifically, we present an extension of the Independent Subspace Analysis algorithm to learn invariant spatio-temporal features from unlabeled video data. We discovered that, despite its simplicity, this method performs surprisingly well when combined with deep learning techniques such as stacking and convolution to learn hierarchical representations. By replacing hand-designed features with our learned features, we achieve classification results superior to all previous published results on the Hollywood2, UCF, KTH and YouTube action recognition datasets. On the challenging Hollywood2 and YouTube action datasets we obtain 53.3\% and 75.8\% respectively, which are approximately 5\% better than the current best published results. Further benefits of this method, such as the ease of training and the efficiency of training and prediction, will also be discussed. You can download our code and learned spatio-temporal features here: http://ai.stanford.edu/\~{}wzou/.},
author = {Le, Quoc V. and Zou, Will Y. and Yeung, Serena Y. and Ng, Andrew Y.},
booktitle = {Proceedings of the 2011 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
doi = {10.1109/CVPR.2011.5995496},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Le et al. - 2011 - Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis(2).pdf:pdf},
isbn = {978-1-4577-0394-2},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
month = jun,
pages = {3361--3368},
publisher = {IEEE},
title = {Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=5995496},
year = {2011}
}

@inproceedings{LeCun1985,
author = {LeCun, Yann},
booktitle = {Proceedings of Cognitiva},
file = {:share/imagedb/perellm1/references/LeCun\_1985\_Une proc\'{e}dure d'apprentissage pour r\'{e}seau a seuil asymmetrique (a Learning Scheme for Asymmetric Threshold Networks).pdf:pdf},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
pages = {599--604},
title = {Une proc{\'e}dure d'apprentissage pour r{\'e}seau a seuil asymmetrique (a Learning Scheme for Asymmetric Threshold Networks)},
url = {http://yann.lecun.com/exdb/publis/pdf/lecun-85.pdf},
year = {1985}
}

@book{Murphy2012,
address = {Cambridge, MA},
author = {Murphy, Kevin P.},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Murphy - 2012 - Machine learning a probabilistic perspective.pdf:pdf},
isbn = {9780262018029},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
publisher = {MIT Press},
title = {Machine learning: a probabilistic perspective},
url = {http://dl.acm.org/citation.cfm?id=2380985},
year = {2012}
}

@article{Reddy2012,
  author        = {Reddy, Kishore K. and Shah, Mubarak},
  title         = {{Recognizing 50 human action categories of web videos}},
  journal       = {Machine Vision and Applications},
  volume        = {24},
  number        = {5},
  pages         = {971--981},
  month         = nov,
  year          = {2012},
  doi           = {10.1007/s00138-012-0450-4},
  issn          = {0932-8092},
  abstract      = {Action recognition on large categories of unconstrained videos taken from the web is a very challenging problem compared to datasets like KTH (6 actions), IXMAS (13 actions), and Weizmann (10 actions). Challenges like camera motion, different viewpoints, large interclass variations, cluttered background, occlusions, bad illumination conditions, and poor quality of web videos cause the majority of the state-of-the-art action recognition approaches to fail. Also, an increased number of categories and the inclusion of actions with high confusion add to the challenges. In this paper, we propose using the scene context information obtained from moving and stationary pixels in the key frames, in conjunction with motion features, to solve the action recognition problem on a large (50 actions) dataset with videos from the web. We perform a combination of early and late fusion on multiple features to handle the very large number of categories. We demonstrate that scene context is a very important feature to perform action recognition on very large datasets. The proposed method does not require any kind of video stabilization, person detection, or tracking and pruning of features. Our approach gives good performance on a large number of action categories; it has been tested on the UCF50 dataset with 50 action categories, which is an extension of the UCF YouTube Action (UCF11) dataset containing 11 action categories. We also tested our approach on the KTH and HMDB51 datasets for comparison.},
  annote        = {Test on UCF50},
  file          = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Reddy, Shah - 2012 - Recognizing 50 human action categories of web videos.pdf:pdf},
  keywords      = {action recognition,fusion,mscthesis,web videos},
  mendeley-tags = {mscthesis}
}

@inproceedings{Serre,
abstract = {We introduce a novel set of features for robust object recognition. Each element of this set is a complex feature obtained by combining position- and scale-tolerant edge-detectors over neighboring positions and multiple orientations. Our system's architecture is motivated by a quantitative model of visual cortex. We show that our approach exhibits excellent recognition performance and outperforms several state-of-the-art systems on a variety of image datasets including many different object categories. We also demonstrate that our system is able to learn from very few examples. The performance of the approach constitutes a suggestive plausibility proof for a class of feedforward models of object recognition in cortex.},
author = {Serre, T. and Wolf, L. and Poggio, T.},
booktitle = {2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'05)},
doi = {10.1109/CVPR.2005.254},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Serre, Wolf, Poggio - 2005 - Object Recognition with Features Inspired by Visual Cortex.pdf:pdf},
isbn = {0-7695-2372-2},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
pages = {994--1000},
publisher = {IEEE},
title = {Object Recognition with Features Inspired by Visual Cortex},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=1467551},
volume = {2},
year = {2005}
}

@article{Simonyan2014,
abstract = {In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16--19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve the state-of-the-art results. Importantly, we have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision.},
archiveprefix = {arXiv},
author = {Simonyan, Karen and Zisserman, Andrew},
eprint = {1409.1556},
journal = {arXiv preprint arXiv:1409.1556},
keywords = {Computer Science - Computer Vision and Pattern Rec,mscthesis},
mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,mscthesis},
month = sep,
primaryclass = {cs.CV},
title = {Very deep convolutional networks for large-scale image recognition},
url = {http://arxiv.org/abs/1409.1556},
year = {2014}
}

@techreport{Aas1999,
author = {Aas, Kjersti and Eikvil, Line},
file = {:share/imagedb/perellm1/references/Aas, Eikvil\_1999\_Text categorisation A survey.pdf:pdf},
institution = {Norwegian Computing Center},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
number = {941},
title = {Text categorisation: A survey},
url = {http://www.oocities.org/rr\_andres/docs/aas99text.pdf},
year = {1999}
}

@article{Andrews1995a,
abstract = {It is becoming increasingly apparent that, without some form of explanation capability, the full potential of trained artificial neural networks (ANNs) may not be realised. This survey gives an overview of techniques developed to redress this situation. Specifically, the survey focuses on mechanisms, procedures, and algorithms designed to insert knowledge into ANNs (knowledge initialisation), extract rules from trained ANNs (rule extraction), and utilise ANNs to refine existing rule bases (rule refinement). The survey also introduces a new taxonomy for classifying the various techniques, discusses their modus operandi, and delineates criteria for evaluating their efficacy.},
author = {Andrews, Robert and Diederich, Joachim and Tickle, Alan B.},
doi = {10.1016/0950-7051(96)81920-4},
issn = {0950-7051},
journal = {Knowledge-Based Systems},
keywords = {fuzzy neural networks,inferencing,knowledge insertion,mscthesis,rule extraction,rule refinement},
mendeley-tags = {fuzzy neural networks,inferencing,knowledge insertion,mscthesis,rule extraction,rule refinement},
month = dec,
number = {6},
pages = {373--389},
series = {Knowledge-based neural networks},
title = {Survey and critique of techniques for extracting rules from trained artificial neural networks},
url = {http://www.sciencedirect.com/science/article/pii/0950705196819204},
volume = {8},
year = {1995}
}

@article{Ba2013,
abstract = {Currently, deep neural networks are the state of the art on problems such as speech recognition and computer vision. In this extended abstract, we show that shallow feed-forward networks can learn the complex functions previously learned by deep nets and achieve accuracies previously only achievable with deep models. Moreover, in some cases the shallow neural nets can learn these deep functions using a total number of parameters similar to the original deep model. We evaluate our method on the TIMIT phoneme recognition task and are able to train shallow fully-connected nets that perform similarly to complex, well-engineered, deep convolutional architectures. Our success in training shallow neural nets to mimic deeper models suggests that there probably exist better algorithms for training shallow feed-forward nets than those currently available.},
archiveprefix = {arXiv},
arxivid = {1312.6184},
author = {Ba, Lei Jimmy and Caruana, Rich},
eprint = {1312.6184},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Ba, Caurana - 2013 - Do Deep Nets Really Need to be Deep.pdf:pdf},
journal = {arXiv preprint arXiv:1312.6184},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
pages = {1--6},
title = {Do Deep Nets Really Need to be Deep?},
url = {http://arxiv.org/abs/1312.6184},
year = {2013}
}

@article{Barto1983,
abstract = {It is shown how a system consisting of two neuronlike adaptive elements can solve a difficult learning control problem. The task is to balance a pole that is hinged to a movable cart by applying forces to the cart's base. It is argued that the learning problems faced by adaptive elements that are components of adaptive networks are at least as difficult as this version of the pole-balancing problem. The learning system consists of a single associative search element (ASE) and a single adaptive critic element (ACE). In the course of learning to balance the pole, the ASE constructs associations between input and output by searching under the influence of reinforcement feedback, and the ACE constructs a more informative evaluation function than reinforcement feedback alone can provide. The differences between this approach and other attempts to solve problems using neurolike elements are discussed, as is the relation of this work to classical and instrumental conditioning in animal learning studies and its possible implications for research in the neurosciences.},
author = {Barto, Andrew G. and Sutton, Richard S. and Anderson, Charles W.},
doi = {10.1109/TSMC.1983.6313077},
issn = {0018-9472},
journal = {IEEE Transactions on Systems, Man, and Cybernetics},
month = sep,
number = {5},
pages = {834--846},
title = {Neuronlike adaptive elements that can solve difficult learning control problems},
url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?tp=\&arnumber=6313077},
volume = {SMC-13},
year = {1983}
}

@article{Bay2008,
abstract = {This article presents a novel scale- and rotation-invariant detector and descriptor, coined SURF (Speeded-Up Robust Features). SURF approximates or even outperforms previously proposed schemes with respect to repeatability, distinctiveness, and robustness, yet can be computed and compared much faster. This is achieved by relying on integral images for image convolutions; by building on the strengths of the leading existing detectors and descriptors (specifically, using a Hessian matrix-based measure for the detector, and a distribution-based descriptor); and by simplifying these methods to the essential. This leads to a combination of novel detection, description, and matching steps. The paper encompasses a detailed description of the detector and descriptor and then explores the effects of the most important parameters. We conclude the article with SURF's application to two challenging, yet converse goals: camera calibration as a special case of image registration, and object recognition. Our experiments underline SURF's usefulness in a broad range of topics in computer vision.},
author = {Bay, Herbert and Ess, Andreas and Tuytelaars, Tinne and van Gool, Luc},
doi = {10.1016/j.cviu.2007.09.014},
issn = {1077-3142},
journal = {Computer Vision and Image Understanding},
keywords = {Camera calibration,Feature description,Interest points,Local features,mscthesis,object recognition},
mendeley-tags = {Camera calibration,Feature description,Interest points,Local features,mscthesis,object recognition},
month = jun,
number = {3},
pages = {346--359},
series = {Similarity Matching in Computer Vision and Multimedia},
title = {Speeded-up robust features ({SURF})},
url = {http://www.sciencedirect.com/science/article/pii/S1077314207001555},
volume = {110},
year = {2008}
}

@article{Bell1995,
abstract = {We derive a new self-organizing learning algorithm that maximizes the information transferred in a network of nonlinear units. The algorithm does not assume any knowledge of the input distributions, and is defined here for the zero-noise limit. Under these conditions, information maximization has extra properties not found in the linear case (Linsker 1989). The nonlinearities in the transfer function are able to pick up higher-order moments of the input distributions and perform something akin to true redundancy reduction between units in the output representation. This enables the network to separate statistically independent components in the inputs: a higher-order generalization of principal components analysis. We apply the network to the source separation (or cocktail party) problem, successfully separating unknown mixtures of up to 10 speakers. We also show that a variant on the network architecture is able to perform blind deconvolution (cancellation of unknown echoes and reverberation in a speech signal). Finally, we derive dependencies of information transfer on time delays. We suggest that information maximization provides a unifying framework for problems in "blind" signal processing.},
author = {Bell, Anthony J. and Sejnowski, Terrence J.},
doi = {10.1162/neco.1995.7.6.1129},
issn = {0899-7667},
journal = {Neural Computation},
keywords = {Algorithms,Humans,Learning,Models- Statistical,Neural Networks (Computer),Neurons,Probability,Problem Solving,Speech,mscthesis},
language = {eng},
mendeley-tags = {Algorithms,Humans,Learning,Models- Statistical,Neural Networks (Computer),Neurons,Probability,Problem Solving,Speech,mscthesis},
month = nov,
number = {6},
pages = {1129--1159},
title = {An information-maximization approach to blind separation and blind deconvolution},
url = {http://www.mitpressjournals.org/doi/abs/10.1162/neco.1995.7.6.1129},
volume = {7},
year = {1995}
}

@incollection{Bengio2007,
abstract = {One long-term goal of machine learning research is to produce methods thatare applicable to highly complex tasks, such as perception (vision, audition), reasoning, intelligent control, and other artificially intelligent behaviors. We argue that in order to progress toward this goal, the Machine Learning community mustendeavor to discover algorithms that can learn highly complex functions, with minimal need for prior knowledge, and with minimal human intervention. We present mathematical and empirical evidence suggesting that many popular approaches to non-parametric learning, particularly kernel methods, are fundamentally limited in their ability to learn complex high-dimensional functions. Our analysis focuses on two problems. First, kernel machines are shallow architectures, in which one large layer of simple template matchers is followed by a single layer of trainable coefficients. We argue that shallow architectures can be very inefficient in terms of required number of computational elements and examples. Second, we analyze a limitation of kernel machines with a local kernel, linked to the curse of dimensionality, that applies to supervised, unsupervised (manifold learning) and semi-supervised kernel machines. Using empirical results on invariant image recognition tasks, kernel methods are compared with deep architectures, in which lower-level features or concepts are progressively combined into more abstract and higher-level representations. We argue that deep architectures have the potential to generalize in non-local ways, i.e., beyond immediate neighbors, and that this is crucial in order to make progress on the kind of complex tasks required for artificial intelligence},
author = {Bengio, Yoshua and LeCun, Yann},
booktitle = {Large-Scale Kernel Machines},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Bengio, LeCun - 2007 - Scaling learning algorithms towards AI.pdf:pdf},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
pages = {1--41},
publisher = {MIT Press},
title = {Scaling learning algorithms towards {AI}},
url = {http://www.iro.umontreal.ca/~lisa/bib/pub\_subject/language/pointeurs/bengio+lecun-chapter2007.pdf},
year = {2007}
}

@article{Bengio2013,
abstract = {We introduce a novel training principle for probabilistic models that is an alternative to maximum likelihood. The proposed Generative Stochastic Networks (GSN) framework is based on learning the transition operator of a Markov chain whose stationary distribution estimates the data distribution. The transition distribution of the Markov chain is conditional on the previous state, generally involving a small move, so this conditional distribution has fewer dominant modes, being unimodal in the limit of small moves. Thus, it is easier to learn because it is easier to approximate its partition function, more like learning to perform supervised function approximation, with gradients that can be obtained by backprop. We provide theorems that generalize recent work on the probabilistic interpretation of denoising autoencoders and obtain along the way an interesting justification for dependency networks and generalized pseudolikelihood, along with a definition of an appropriate joint distribution and sampling mechanism even when the conditionals are not consistent. GSNs can be used with missing inputs and can be used to sample subsets of variables given the rest. We validate these theoretical results with experiments on two image datasets using an architecture that mimics the Deep Boltzmann Machine Gibbs sampler but allows training to proceed with simple backprop, without the need for layerwise pretraining.},
archiveprefix = {arXiv},
arxivid = {1306.1091},
author = {Bengio, Yoshua and Thibodeau-Laufer, \'{E}ric and Alain, Guillaume and Yosinski, Jason},
eprint = {1306.1091},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Bengio et al. - 2013 - Deep Generative Stochastic Networks Trainable by Backprop.pdf:pdf},
journal = {arXiv preprint arXiv:1306.1091},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
month = jun,
title = {Deep Generative Stochastic Networks Trainable by Backprop},
url = {http://arxiv.org/abs/1306.1091},
year = {2013}
}

@article{Bernstein1981,
abstract = {ABSTRACT: PROFILE of Marvin Minsky, professor at MIT, who works in artificial intelligence. He was born in N.Y. in 1927. The term “artificial intelligence” is usually attributed to John McCarthy, a former colleague of Minsky's. He coined the phrase in the mid-fifties to describe the ability of certain machines to do things that people call intelligent. In 1958 McCarthy \& Minsky created the Artificial Intelligence Group at MIT, \& it soon became one of the most distinguished scientific enterprises in the world. Today about a hundred people work in the lab \& it gets some \$2.5 million a year from various government agencies. In 1968 the group formally became the A.I. Lab. Minsky became the director, a job he held until 1973 when he turned it over to Patrick Winston, one of his former students. Tells about Minsky's days at Harvard \& Princeton. Tells about his projects. His goal has been to learn what computers could do in solving non-arithmetic problems—in short, to make them intelligent. At present, debate rages about what “artificial intelligence” really is. The most commonly accepted idea is that it's the attempt to produce a computer whose output resembles, or finally cannot be distinguished from, a human mind. To make such a machine is an enormous task. People have been attacking it in pieces: thus the attempts to make machines that play games, “understand” newspaper accounts, \& can recognize patterns. That machines can already do all these things, with varying degrees of success, is a fact. The debate is over what this means. If a humanoid machine were built, many argue it would not understand what it was doing, that it was only simulating intelligence while the real thing would always lie beyond it.},
annote = {p69 For a while, I studied topology, and then I ran into a young graduate student in physics named Dean Edmonds, who was a whiz at electronics. We began to build vacuum-tube circuits that did all sorts of things."
As an undergraduate, Minsky had begun to imagine building an electronic machine that could learn. He had become fascinated by a paper that had been written, in 1943, by Warren S. McCulloch, a neurophysiologist, and Walter Pitts, a mathematical prodigy. In this paper, McCulloch and Pitts created an abstract model of the brain cells—the neurons—and showed how they might be connected to carry out mental processes such as learning. Minsky now thought that the time might be ripe to try to create such a machine. "I told Edmonds that I thought it might be too hard to build," he said. "The one I then envisioned would have needed a lot of memory circuits. There would be electronic neurons connected by synapses that would determine when the neurons fired. The synapses would have various probabilities for conducting. But to reinforce 'success' one would have to have a way of changing these probabilities. There would have to be loops and cycles in the circuits so that the machine could remember traces of its past and adjust its behavior. I thought that if I could ever build such a machine I might get it to learn to run mazes through its electronics— like rats or something. I didn't think that it would be very intelligent. I thought it would work pretty well with about forty neurons. Edmonds and I worked out some circuits so that —in principle, at least—we could realize each of these neurons with just six vacuum tubes and a motor." Minsky told George Miller, at Harvard, about the prospective design. "He said, 'Why don't we just try it?' " Minsky recalled. "He had a lot of faith in me, which I appreciated. Somehow, he managed to get a couple of thousand dollars from the Office of Naval Research, and in the summer of 1951 Dean Edmonds and I went up to Harvard and built our machine. It had three hundred tubes and a lot of motors. It needed some automatic electric clutches, which we machined ourselves.
The memory of the machine was stored in the positions of its control knobs—forty of them—and when the machine was learning it used the clutches to adjust its own knobs. We used a surplus gyropilot from a B-24 bomber to move the clutches." Minsky's machine was certainly one of the first electronic learning machines, and perhaps the very first one. In addition to its neurons and synapses and its internal memory loops, many of the networks were wired at random, so that it was impossible to predict what it would do. A "rat" would be created at some point in the network and would then set out to learn a path to some specified end point. First, it would proceed randomly, and then correct choices would be reinforced by making it easier for the machine to make this choice again—to increase the probability of its doing so. There was an arrangement of lights that allowed observers to follow the progress of the rat—or rats. "It turned out that because of an electronic accident in our design we could put two or three rats in the same maze and follow them all," Minsky told me. "The rats actually interacted with one another. If one of them found a good path, the others would tend to follow it. We sort of quit science for a while to watch the machine. We were amazed that it could have several activities going on at once in its little nervous system. Because of the random wiring, it had a sort of fail-safe characteristic. If one of the neurons wasn't working, it wouldn't make much of a difference —and, with nearly three hundred tubes and the thousands of connections we had soldered, there would usually be something wrong somewhere. In those days, even a radio set with twenty tubes tended to fail a lot. I don't think we ever debugged our machine completely, but that didn't matter. By having this crazy random design, it was almost sure to work, no matter how you built it."
Minsky went on, "My Harvard machine was basically Skinnerian, although Skinner, with whom I talked a great deal while I was building it, was never much interested in it. The unrewarded behavior of my machine was more or less random. This limited its learning ability. It could never formulate a plan. The next idea I had, which I worked on for my doctoral thesis, was to give the network a second memory, which remembered after a response what the stimulus had been. This enabled one to bring in the idea of prediction. If the machine or animal is confronted with a new situation, it can search its memory to see what would happen if it reacted in certain ways. If, say, there was an unpleasant association with a certain stimulus, then the machine could choose a different response. I had the naive idea that if one could build a big enough network, with enough memory loops, it might get lucky and acquire the ability to envision things in its head. This became a field of study later. It was called self-organizing random networks. Even today, I still get letters from young students who say, 'Why are you people trying to program intelligence? Why don't you try to find a way to build a nervous system that will just spontaneously create it?' Finally, I decided that either this was a bad idea or it would take thousands or millions of neurons to make it work, and I couldn't afford to try to build a machine like that." I asked Minsky why it had not occurred to him to use a computer to simulate his machine. By this time, the first electronic digital computer— named ENIAC, for "electronic numerical integrator and calculator"—had been built, at the University of Pennsylvania's Moore School of Electrical Engineering; and the mathematician John von Neumann was completing work on a computer, the prototype of many present-day computers, at the Institute for Advanced Study. "I knew a little bit about computers," Minsky answered.
"At Harvard, I had even taken a course with Howard Aiken"—one of the first computer designers. "Aiken had built an electromechanical machine in the early forties. It had only about a hundred memory registers, and even von Neumann's machine had only a thousand. On the one hand, I was afraid of the complexity of these machines. On the other hand, I thought that they weren't big enough to do anything interesting in the way of learning. In any case, I did my thesis on ideas about how the nervous system might learn.},
author = {Bernstein, Jeremy},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/NIWAGMED/a-i.html:html},
journal = {The New Yorker},
keywords = {Advanced Research Projects Agency,Andrew,Artificial Intelligence Laboratory,Berliner,Bertram,Bobrow,Chess,Computer Language,Computers,Crick,Daniel,Dartmouth Summer Research on Artificial Intelligen,Dean,Digital Equipment Corp.,Edmonds,Electronic Learning Machine,Francis,Frank,Gelernter,Gleason,Hans,Harvard University,Herbert,John,MARVIN,MINSKY,Massachusetts Institute of Technology,Mathematicians,McCarthy,Microcomputers,Papert,Perceptron,Project MAC,Raphael,Robots,Rosenblatt,Seymour,artificial intelligence,mscthesis},
mendeley-tags = {Advanced Research Projects Agency,Andrew,Artificial Intelligence Laboratory,Berliner,Bertram,Bobrow,Chess,Computer Language,Computers,Crick,Daniel,Dartmouth Summer Research on Artificial Intelligen,Dean,Digital Equipment Corp.,Edmonds,Electronic Learning Machine,Francis,Frank,Gelernter,Gleason,Hans,Harvard University,Herbert,John,MARVIN,MINSKY,Massachusetts Institute of Technology,Mathematicians,McCarthy,Microcomputers,Papert,Perceptron,Project MAC,Raphael,Robots,Rosenblatt,Seymour,artificial intelligence,mscthesis},
month = dec,
pages = {50},
title = {{A. I.}},
url = {http://www.newyorker.com/magazine/1981/12/14/a-i},
year = {1981}
}

@inproceedings{Borji2013,
abstract = {Several decades of research in computer and primate vision have resulted in many models (some specialized for one problem, others more general) and invaluable experimental data. Here, to help focus research efforts onto the hardest unsolved problems, and bridge computer and human vision, we define a battery of 5 tests that measure the gap between human and machine performances in several dimensions (generalization across scene categories, generalization from images to edge maps and line drawings, invariance to rotation and scaling, local/global information with jumbled images, and object recognition performance). We measure model accuracy and the correlation between model and human error patterns. Experimenting over 7 datasets, where human data is available, and gauging 14 well-established models, we find that none fully resembles humans in all aspects, and we learn from each test which models and features are more promising in approaching humans in the tested dimension. Across all tests, we find that models based on local edge histograms consistently resemble humans more, while several scene statistics or "gist" models do perform well with both scenes and objects. While computer vision has long been inspired by human vision, we believe systematic efforts, such as this, will help better identify shortcomings of models and find new paths forward.},
author = {Borji, Ali and Itti, Laurent},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/PFA7SKFV/Borji and Itti - 2013 - Human vs. Computer in Scene and Object Recognition.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/I733GRQN/Borji\_Human\_vs.\_Computer\_2014\_CVPR\_paper.html:html},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
pages = {113--120},
title = {{Human vs. Computer in Scene and Object Recognition}},
url = {http://www.cv-foundation.org/openaccess/content\_cvpr\_2014/html/Borji\_Human\_vs.\_Computer\_2014\_CVPR\_paper.html http://www.cv-foundation.org/openaccess/content\_cvpr\_2014/papers/Borji\_Human\_vs.\_Computer\_2014\_CVPR\_paper.pdf},
year = {2013}
}

@inproceedings{Brown2005,
abstract = {This paper presents a system for fully automatic recognition and reconstruction of 3D objects in image databases. We pose the object recognition problem as one of finding consistent matches between all images, subject to the constraint that the images were taken from a perspective camera. We assume that the objects or scenes are rigid. For each image, we associate a camera matrix, which is parameterised by rotation, translation and focal length. We use invariant local features to find matches between all images, and the RANSAC algorithm to find those that are consistent with the fundamental matrix. Objects are recognised as subsets of matching images. We then solve for the structure and motion of each object, using a sparse bundle adjustment algorithm. Our results demonstrate that it is possible to recognise and reconstruct 3D objects from an unordered image database with no user input at all.},
author = {Brown, M. and Lowe, D. G.},
booktitle = {Fifth International Conference on 3-D Digital Imaging and Modeling (3DIM'05)},
doi = {10.1109/3DIM.2005.81},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/M5HQNR5D/Brown and Lowe - 2005 - Unsupervised 3D object recognition and reconstruct.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/X3FCTMS2/abs\_all.html:html},
keywords = {Computer Graphics,Computer science,Computer vision,Image databases,Image recognition,Layout,RANSAC algorithm,Sparse matrices,automatic recognition,camera matrix,cameras,feature extraction,image matching,image motion analysis,image reconstruction,invariant local features,object motion,object recognition,object reconstruction,sparse bundle adjustment algorithm,unordered datasets,unsupervised 3D object recognition,visual databases},
mendeley-tags = {Computer Graphics,Computer science,Computer vision,Image databases,Image recognition,Layout,RANSAC algorithm,Sparse matrices,automatic recognition,camera matrix,cameras,feature extraction,image matching,image motion analysis,image reconstruction,invariant local features,object motion,object recognition,object reconstruction,sparse bundle adjustment algorithm,unordered datasets,unsupervised 3D object recognition,visual databases},
month = jun,
pages = {56--63},
title = {{Unsupervised 3D object recognition and reconstruction in unordered datasets}},
url = {http://ieeexplore.ieee.org/ielx5/9854/31039/01443228.pdf?tp=\&arnumber=1443228\&isnumber=31039 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1443228},
year = {2005}
}

@inproceedings{Chopra2005,
abstract = {We present a method for training a similarity metric from data. The method can be used for recognition or verification applications where the number of categories is very large and not known during training, and where the number of training samples for a single category is very small. The idea is to learn a function that maps input patterns into a target space such that the L1 norm in the target space approximates the "semantic" distance in the input space. The method is applied to a face verification task. The learning process minimizes a discriminative loss function that drives the similarity metric to be small for pairs of faces from the same person, and large for pairs from different persons. The mapping from raw to the target space is a convolutional network whose architecture is designed for robustness to geometric distortions. The system is tested on the Purdue/AR face database which has a very high degree of variability in the pose, lighting, expression, position, and artificial occlusions such as dark glasses and obscuring scarves.},
author = {Chopra, S. and Hadsell, R. and LeCun, Y.},
booktitle = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'05)},
doi = {10.1109/CVPR.2005.202},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/2GHSSFJV/Chopra et al. - 2005 - Learning a similarity metric discriminatively, wit.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/EXAXMRBV/abs\_all.html:html},
keywords = {Artificial neural networks,Character generation,Drives,Glass,L1 norm,Robustness,Spatial databases,Support vector machine classification,System testing,discriminative loss function,face recognition,face verification,geometric distortion,learning (artificial intelligence),mscthesis,semantic distance approximation,similarity metric learning,support vector machines},
mendeley-tags = {Artificial neural networks,Character generation,Drives,Glass,L1 norm,Robustness,Spatial databases,Support vector machine classification,System testing,discriminative loss function,face recognition,face verification,geometric distortion,learning (artificial intelligence),mscthesis,semantic distance approximation,similarity metric learning,support vector machines},
month = jun,
pages = {539--546},
title = {{Learning a similarity metric discriminatively, with application to face verification}},
url = {http://ieeexplore.ieee.org/ielx5/9901/31472/01467314.pdf?tp=\&arnumber=1467314\&isnumber=31472 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1467314\&tag=1 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1467314},
volume = {1},
year = {2005}
}

@article{Cull2007,
abstract = {N. Rashevsky (1899–1972) was one of the pioneers in the application of mathematics to biology. With the slogan: mathematical biophysics : biology :: mathematical physics : physics, he proposed the creation of a quantitative theoretical biology. Here, we will give a brief biography, and consider Rashevsky's contributions to mathematical biology including neural nets and relational biology. We conclude that Rashevsky was an important figure in the introduction of quantitative models and methods into biology.},
author = {Cull, Paul},
doi = {10.1016/j.biosystems.2006.11.003},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/BIJD6PSV/Cull - 2007 - The mathematical biophysics of Nicolas Rashevsky.pdf:pdf},
issn = {0303-2647},
journal = {Biosystems},
keywords = {History,Mathematical biology,Mathematical biophysics,Rashevsky,Relational biology,mscthesis,neural nets},
mendeley-tags = {History,Mathematical biology,Mathematical biophysics,Rashevsky,Relational biology,mscthesis,neural nets},
month = apr,
number = {3},
pages = {178--184},
series = {BIOCOMP 2005: Selected papers presented at the International Conference - Diffusion Processes in Neurobiology and Subcellular Biology BIOCOMP2006: Diffusion Processes in Neurobiology and Subcellular Biology},
title = {{The mathematical biophysics of Nicolas Rashevsky}},
url = {http://www.sciencedirect.com/science/article/pii/S0303264706002140 http://www.sciencedirect.com/science/article/pii/S0303264706002140/pdfft?md5=87c78451bbabcc4aaa6731671c574e5e\&pid=1-s2.0-S0303264706002140-main.pdf},
volume = {88},
year = {2007}
}

@book{Daniilidis2010,
editor = {Daniilidis, Kostas and Maragos, Petros and Paragios, Nikos},
file = {:share/imagedb/perellm1/references/Daniilidis, Maragos, Paragios\_2010\_Computer Vision--ECCV 2010.pdf:pdf},
isbn = {9783642155666},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
title = {{Computer Vision--ECCV 2010}},
url = {http://link.springer.com/content/pdf/10.1007/978-3-642-15552-9.pdf},
year = {2010}
}

@book{Dawson2008,
abstract = {"Connectionism" is a "hands on" introduction to connectionist modeling through practical exercises in different types of connectionist architectures. explores three different types of connectionist architectures - distributed associative memory, perceptron, and multilayer perceptron provides a brief overview of each architecture, a detailed introduction on how to use a program to explore this network, and a series of practical exercises that are designed to highlight the advantages, and disadvantages, of each accompanied by a website at http: //www.bcp.psych.ualberta.ca/ mike/Book3/ that includes practice exercises and software, as well as the files and blank exercise sheets required for performing the exercises designed to be used as a stand-alone volume or alongside "Minds and Machines: Connectionism and Psychological Modeling" (by Michael R.W. Dawson, Blackwell 2004)},
author = {Dawson, Michael R. W.},
file = {:share/imagedb/perellm1/references/Dawson\_2008\_Connectionism A Hands-on Approach.pdf:pdf},
isbn = {9781405143899},
keywords = {Psychology / Cognitive Psychology,Psychology / Cognitive Psychology \& Cognition,Psychology / General,mscthesis},
language = {en},
mendeley-tags = {Psychology / Cognitive Psychology,Psychology / Cognitive Psychology \& Cognition,Psychology / General,mscthesis},
month = apr,
pages = {211},
publisher = {John Wiley \& Sons},
shorttitle = {Connectionism},
title = {{Connectionism: A Hands-on Approach}},
url = {http://books.google.fi/books?id=LiZvtQqNLDUC},
year = {2008}
}

@article{Farabet2013,
abstract = {Scene labeling consists of labeling each pixel in an image with the category of the object it belongs to. We propose a method that uses a multiscale convolutional network trained from raw pixels to extract dense feature vectors that encode regions of multiple sizes centered on each pixel. The method alleviates the need for engineered features, and produces a powerful representation that captures texture, shape, and contextual information. We report results using multiple postprocessing methods to produce the final labeling. Among those, we propose a technique to automatically retrieve, from a pool of segmentation components, an optimal set of components that best explain the scene; these components are arbitrary, for example, they can be taken from a segmentation tree or from any family of oversegmentations. The system yields record accuracies on the SIFT Flow dataset (33 classes) and the Barcelona dataset (170 classes) and near-record accuracy on Stanford background dataset (eight classes), while being an order of magnitude faster than competing approaches, producing a 320×240 image labeling in less than a second, including feature extraction.},
author = {Farabet, Cl{\'e}ment and Couprie, Camille and Najman, Laurent and LeCun, Yann},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Farabet et al. - 2012 - Learning hierarchical features for scene labeling.pdf:pdf},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {CNN,mscthesis},
mendeley-tags = {CNN,mscthesis},
number = {8},
pages = {1915--1929},
title = {{Learning hierarchical features for scene labeling}},
url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=6338939 http://hal.archives-ouvertes.fr/docs/00/74/20/77/PDF/farabet-pami-13.pdf},
volume = {35},
year = {2013}
}

@phdthesis{Farabet2014,
abstract = {One of the open questions of artificial computer vision is how to produce good internal representations of the visual world. What sort of internal representation would allow an artificial vision system to detect and classify objects into categories, independently of pose, scale, illumination, conforma- tion, and clutter? More interestingly, how could an artificial vision system learn appropriate internal representations automatically, the way animals and humans seem to learn by simply looking at the world? Another related question is that of computational tractability, and more precisely that of computational efficiency. Given a good visual represen- tation, how efficiently can it be trained, and used to encode new sensorial data. Efficiency has several dimensions: power requirements, processing speed, and memory usage. In this thesis I present three new contributions to the field of computer vision: (1) a multiscale deep convolutional network architecture to easily capture long-distance relationships between input variables in image data, (2) a tree-based algorithm to efficiently explore multiple segmentation can- didates, to produce maximally confident semantic segmentations of images, (3) a custom dataflow computer architecture optimized for the computation of convolutional networks, and similarly dense image processing models. All three contributions were produced with the common goal of getting us closer to real-time image understanding. Scene parsing consists in labeling each pixel in an image with the category of the object it belongs to. In the first part of this thesis, I propose a method that uses a multiscale convolutional network trained from raw pixels to extract dense feature vectors that encode regions of multiple sizes centered on each pixel. The method alleviates the need for engineered features. Inparallel to feature extraction, a tree of segments is computed from a graph of pixel dissimilarities. The feature vectors associated with the segments covered by each node in the tree are aggregated and fed to a classifier which produces an estimate of the distribution of object categories contained in the segment. A subset of tree nodes that cover the image are then selected so as to maximize the average “purity” of the class distributions, hence maximizing the overall likelihood that each segment contains a single object. The system yields record accuracies on several public benchmarks. The computation of convolutional networks, and related models heavily relies on a set of basic operators that are particularly fit for dedicated hardware implementations. In the second part of this thesis I introduce a scalable dataflow hardware architecture optimized for the computation of general-purpose vision algorithms—neuFlow —and a dataflow compiler— luaFlow —that transforms high-level flow-graph representations of these al- gorithms into machine code for neuFlow. This system was designed with the goal of providing real-time detection, categorization and localization of objects in complex scenes, while consuming 10 Watts when implemented on a Xilinx Virtex 6 FPGA platform, or about ten times less than a lap- top computer, and producing speedups of up to 100 times in real-world applications (results from 2011).},
author = {Farabet, Cl{\'e}ment},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Farabet - 2014 - Towards Real-Time Image Understanding with Convolutional Networks.pdf:pdf},
keywords = {CNN,mscthesis},
mendeley-tags = {CNN,mscthesis},
pages = {124},
school = {Universit{\'e} Paris-Est},
title = {{Towards Real-Time Image Understanding with Convolutional Networks}},
url = {http://hal-upec-upem.archives-ouvertes.fr/docs/00/96/56/22/PDF/thA\_se.pdf},
year = {2014}
}

@article{Fay2010,
abstract = {In a mathematical approach to hypothesis tests, we start with a clearly defined set of hypotheses and choose the test with the best properties for those hypotheses. In practice, we often start with less precise hypotheses. For example, often a researcher wants to know which of two groups generally has the larger responses, and either a t-test or a Wilcoxon-Mann-Whitney (WMW) test could be acceptable. Although both t-tests and WMW tests are usually associated with quite different hypotheses, the decision rule and p-value from either test could be associated with many different sets of assumptions, which we call perspectives. It is useful to have many of the different perspectives to which a decision rule may be applied collected in one place, since each perspective allows a different interpretation of the associated p-value. Here we collect many such perspectives for the two-sample t-test, the WMW test and other related tests. We discuss validity and consistency under each perspective and discuss recommendations between the tests in light of these many different perspectives. Finally, we briefly discuss a decision rule for testing genetic neutrality where knowledge of the many perspectives is vital to the proper interpretation of the decision rule.},
author = {Fay, M. P. and Proschan, M. A.},
doi = {10.1214/09-SS051},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/X5WRBZBX/Fay and Proschan - 2010 - Wilcoxon-Mann-Whitney or t-test On assumptions fo.pdf:pdf},
issn = {1935-7516},
journal = {Statistics Surveys},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
pages = {1--39},
shorttitle = {Wilcoxon-Mann-Whitney or t-test?},
title = {{Wilcoxon-Mann-Whitney or t-test? On assumptions for hypothesis tests and multiple interpretations of decision rules}},
url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/ http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/pdf/nihms-185373.pdf},
volume = {4},
year = {2010}
}

@inproceedings{Girshick2014,
abstract = {Object detection performance, as measured on the canonical PASCAL VOC dataset, has plateaued in the last few years. The best-performing methods are complex ensemble systems that typically combine multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 30\% relative to the previous best result on VOC 2012---achieving a mAP of 53.3\%. Our approach combines two key insights: (1) one can apply high-capacity convolutional neural networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data is scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, yields a significant performance boost. Since we combine region proposals with CNNs, we call our method R-CNN: Regions with CNN features. We also compare R-CNN to OverFeat, a recently proposed sliding-window detector based on a similar CNN architecture. We find that R-CNN outperforms OverFeat by a large margin on the 200-class ILSVRC2013 detection dataset. Source code for the complete system is available at http://www.cs.berkeley.edu/\~{}rbg/rcnn.},
annote = {$\backslash$begin\{itemize\}$\backslash$item Region proposals using CNN (R-CNN)$\backslash$item Object detection in three steps:$\backslash$begin\{description\}$\backslash$item[Region proposal] using selective search$\backslash$item[Feature extraction] using Alexnet for 4096-dimentional feature vectors in each region (Caffe implementation)$\backslash$item[Classification] using one-vs-all linear SVMs$\backslash$end\{description\}$\backslash$item Pretraining Alexnet architecture$\backslash$begin\{itemize\}$\backslash$item Changed last softmax layer from 1000 to 21 size (ILSVRC 2012 vs PASCAL)$\backslash$item Fine-tuning with PASCAL$\backslash$item mini-batch 128 (32 positive vs 96 background)$\backslash$end\{itemize\}$\backslash$item Results on PASCAL VOC 2010-12$\backslash$begin\{itemize\}$\backslash$item Compared to UVA system from Uijlings et. al. Selective search for object recognition'' Uijlings2013$\backslash$item Improved from 35.1$\backslash$\% to 53.7$\backslash$\% mAP$\backslash$end\{itemize\}$\backslash$item Analysis of pretrained Alexnet$\backslash$begin\{itemize\}$\backslash$item FC6 generalizes better than FC7$\backslash$item Good results if remove FC6 and FC7 (that only keeps 6$\backslash$\% of the parameters)$\backslash$end\{itemize\}$\backslash$item [$\backslash$ldots] classical tools from computer vision and deep learning [$\backslash$ldots] the two are natural and inevitable partners.''$\backslash$end\{itemize\}},
author = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/SIHTHBKA/Girshick et al. - 2013 - Rich feature hierarchies for accurate object detec.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/HN9MBJS3/1311.html:html},
keywords = {Computer Science - Computer Vision and Pattern Rec,mscthesis},
mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,mscthesis},
title = {{Rich feature hierarchies for accurate object detection and semantic segmentation}},
url = {http://arxiv.org/abs/1311.2524 http://www.arxiv.org/pdf/1311.2524.pdf},
year = {2014}
}

@article{Girshick2014b,
abstract = {Deformable part models (DPMs) and convolutional neural networks (CNNs) are two widely used tools for visual recognition. They are typically viewed as distinct approaches: DPMs are graphical models (Markov random fields), while CNNs are "black-box" non-linear classifiers. In this paper, we show that a DPM can be formulated as a CNN, thus providing a novel synthesis of the two ideas. Our construction involves unrolling the DPM inference algorithm and mapping each step to an equivalent (and at times novel) CNN layer. From this perspective, it becomes natural to replace the standard image features used in DPM with a learned feature extractor. We call the resulting model DeepPyramid DPM and experimentally validate it on PASCAL VOC. DeepPyramid DPM significantly outperforms DPMs based on histograms of oriented gradients features (HOG) and slightly outperforms a comparable version of the recently introduced R-CNN detection system, while running an order of magnitude faster.},
annote = {$\backslash$begin\{itemize\}$\backslash$item A DPM can be expresed as a CNN$\backslash$item when using the new distance transform pooling that generalizes max pooling$\backslash$item and maxout units$\backslash$item DeepPyramid DPM takes an image pyramid and produces a pyramid of object detectors$\backslash$item Instead of using HOG uses a CNN$\backslash$begin\{itemize\}$\backslash$item from pretrained Alexnet (CNN)$\backslash$item remove fc6, fc7, fc8 and pool5$\backslash$item only interested on conv5 (256 feature channels)$\backslash$item Each pixel on conv5 feature map corresponds to 16 pixels in the original image$\backslash$end\{itemize\}$\backslash$item Create the image pyramid$\backslash$begin\{itemize\}$\backslash$item Resize image largest dimension to 1.713 pixels$\backslash$item Conv5 sees 108 cells in longest side$\backslash$item 7 pyramid levels with scale factor 1/sqrt(2)$\backslash$item total of 25k output cells per image$\backslash$item For comparison: 1k5 in OverFeat and 250k commonly with HOG$\backslash$end\{itemize\}$\backslash$item Results$\backslash$begin\{itemize\}$\backslash$item Conv5 only fires to certain scales per class$\backslash$item On the other hand, HOG in all scales$\backslash$end\{itemize\}$\backslash$end\{itemize\}},
author = {Girshick, Ross and Iandola, Forrest and Darrell, Trevor and Malik, Jitendra},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/GN2C2U44/Girshick et al. - 2014 - Deformable Part Models are Convolutional Neural Ne.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/37CMCFC9/1409.html:html},
journal = {arXiv preprint arXiv:1409.5403},
keywords = {Computer Science - Computer Vision and Pattern Rec,mscthesis},
mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,mscthesis},
month = sep,
title = {{Deformable part models are convolutional neural networks}},
url = {http://arxiv.org/abs/1409.5403 http://www.arxiv.org/pdf/1409.5403.pdf},
year = {2014}
}

@article{Graves2009, abstract = {Recognizing lines of unconstrained handwritten text is a challenging task. The difficulty of segmenting cursive or overlapping characters, combined with the need to exploit surrounding context, has led to low recognition rates for even the best current recognizers. Most recent progress in the field has been made either through improved preprocessing or through advances in language modeling. Relatively little work has been done on the basic recognition algorithms. Indeed, most systems rely on the same hidden Markov models that have been used for decades in speech and handwriting recognition, despite their well-known shortcomings. This paper proposes an alternative approach based on a novel type of recurrent neural network, specifically designed for sequence labeling tasks where the data is hard to segment and contains long-range bidirectional interdependencies. In experiments on two large unconstrained handwriting databases, our approach achieves word recognition accuracies of 79.7 percent on online data and 74.1 percent on offline data, significantly outperforming a state-of-the-art HMM-based system. In addition, we demonstrate the network's robustness to lexicon size, measure the individual influence of its hidden layers, and analyze its use of context. 
Last, we provide an in-depth discussion of the differences between the network and HMMs, suggesting reasons for the network's superior performance.}, author = {Graves, A and Liwicki, M. and Fernandez, S. and Bertolami, R. and Bunke, H. and Schmidhuber, J.}, doi = {10.1109/TPAMI.2008.137}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/2C7X9EWC/Graves et al. - 2009 - A Novel Connectionist System for Unconstrained Han.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/F2IX2S8J/abs\_all.html:html}, issn = {0162-8828}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, keywords = {Algorithms,Automatic Data Processing,Connectionist temporal classification,Handwriting,Image Enhancement,Image Interpretation- Computer-Assisted,Information Storage and Retrieval,Long Short-Term Memory,Models- Statistical,Offline handwriting recognition,Online handwriting recognition,Pattern Recognition- Automated,Reading,Recurrent neural networks,Reproducibility of Results,Sensitivity and Specificity,Subtraction Technique,Unconstrained handwriting recognition,bidirectional long short-term memory,connectionist system,handwriting recognition,handwritten character recognition,hidden Markov model.,hidden Markov models,image segmentation,language modeling,mscthesis,offline handwriting,online handwriting,overlapping character segmentation,recurrent neural nets,recurrent neural network,unconstrained handwriting databases,unconstrained handwriting text recognition}, mendeley-tags = {Algorithms,Automatic Data Processing,Connectionist temporal classification,Handwriting,Image Enhancement,Image Interpretation- Computer-Assisted,Information Storage and Retrieval,Long Short-Term Memory,Models- Statistical,Offline handwriting recognition,Online handwriting recognition,Pattern Recognition- Automated,Reading,Recurrent neural networks,Reproducibility of Results,Sensitivity and Specificity,Subtraction 
Technique,Unconstrained handwriting recognition,bidirectional long short-term memory,connectionist system,handwriting recognition,handwritten character recognition,hidden Markov model.,hidden Markov models,image segmentation,language modeling,mscthesis,offline handwriting,online handwriting,overlapping character segmentation,recurrent neural nets,recurrent neural network,unconstrained handwriting databases,unconstrained handwriting text recognition}, month = may, number = {5}, pages = {855--868}, title = {{A Novel Connectionist System for Unconstrained Handwriting Recognition}}, url = {http://ieeexplore.ieee.org/ielx5/34/4804117/04531750.pdf?tp=\&arnumber=4531750\&isnumber=4804117 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4531750\&tag=1}, volume = {31}, year = {2009} }  @article{Hinton2006a, abstract = {We show how to use ``complementary priors'' to eliminate the explaining-away effects that make inference difficult in densely connected belief nets that have many hidden layers. Using complementary priors, we derive a fast, greedy algorithm that can learn deep, directed belief networks one layer at a time, provided the top two layers form an undirected associative memory. The fast, greedy algorithm is used to initialize a slower learning procedure that fine-tunes the weights using a contrastive version of the wake-sleep algorithm. After fine-tuning, a network with three hidden layers forms a very good generative model of the joint distribution of handwritten digit images and their labels. This generative model gives better digit classification than the best discriminative learning algorithms. 
The low-dimensional manifolds on which the digits lie are modeled by long ravines in the free-energy landscape of the top-level associative memory, and it is easy to explore these ravines by using the directed connections to display what the associative memory has in mind.}, author = {Hinton, GE and Osindero, Simon and Teh, YW}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hinton, Osindero, Teh - 2006 - A fast learning algorithm for deep belief nets.pdf:pdf}, journal = {Neural computation}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{A fast learning algorithm for deep belief nets}}, url = {http://www.mitpressjournals.org/doi/abs/10.1162/neco.2006.18.7.1527 http://www.cs.toronto.edu/~hinton/absps/fastnc.pdf}, year = {2006} }  @article{Hinton2006b, abstract = {High-dimensional data can be converted to low-dimensional codes by training a multilayer neural network with a small central layer to reconstruct high-dimensional input vectors. Gradient descent can be used for fine-tuning the weights in such ``autoencoder'' networks, but this works well only if the initial weights are close to a good solution. 
We describe an effective way of initializing the weights that allows deep autoencoder networks to learn low-dimensional codes that work much better than principal components analysis as a tool to reduce the dimensionality of data.}, author = {Hinton, GE and Salakhutdinov, RR}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hinton, Salakhutdinov - 2006 - Reducing the dimensionality of data with neural networks.pdf:pdf}, journal = {Science}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, number = {5786}, pages = {504--507}, title = {{Reducing the dimensionality of data with neural networks}}, url = {http://www.sciencemag.org/content/313/5786/504.short http://www.lsv.uni-saarland.de/Seminar/ML\_for\_NLP\_SS12/HinSal06.pdf}, volume = {313}, year = {2006} }  @misc{Jia2013, author = {Jia, Yangqing}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Caffe: An open source convolutional architecture for fast feature embedding.}}, url = {http://caffe.berkeleyvision.org/}, year = {2013} }  @article{Khan2013, abstract = {In this article we investigate the problem of human action recognition in static images. By action recognition we intend a class of problems which includes both action classification and action detection (i.e. simultaneous localization and classification). Bag-of-words image representations yield promising results for action classification, and deformable part models perform very well for object detection. The representations for action recognition typically use only shape cues and ignore color information. Inspired by the recent success of color in image classification and object detection, we investigate the potential of color for action classification and detection in static images. We perform a comprehensive evaluation of color descriptors and fusion approaches for action recognition. 
Experiments were conducted on the three datasets most used for benchmarking action recognition in still images: Willow, PASCAL VOC 2010 and Stanford-40. Our experiments demonstrate that incorporating color information considerably improves recognition performance, and that a descriptor based on color names outperforms pure color descriptors. Our experiments demonstrate that late fusion of color and shape information outperforms other approaches on action recognition. Finally, we show that the different color--shape fusion approaches result in complementary information and combining them yields state-of-the-art performance for action classification.}, author = {Khan, FS and Anwer, RM}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Khan, Anwer - 2013 - Coloring Action Recognition in Still Images.pdf:pdf}, journal = {International Journal of Computer Vision}, keywords = {color features,image representation,mscthesis}, mendeley-tags = {mscthesis}, pages = {1--18}, title = {{Coloring Action Recognition in Still Images}}, url = {http://link.springer.com/article/10.1007/s11263-013-0633-0}, year = {2013} }  @article{Kohonen1972, abstract = {A new model for associative memory, based on a correlation matrix, is suggested. In this model information is accumulated on memory elements as products of component data. Denoting a key vector by q(p), and the data associated with it by another vector x(p), the pairs (q(p), x(p)) are memorized in the form of a matrix \{see the Equation in PDF File\} where c is a constant. A randomly selected subset of the elements of Mxq can also be used for memorizing. The recalling of a particular datum x(r) is made by a transformation x(r)=Mxqq(r). This model is failure tolerant and facilitates associative search of information; these are properties that are usually assigned to holographic memories. 
Two classes of memories are discussed: a complete correlation matrix memory (CCMM), and randomly organized incomplete correlation matrix memories (ICMM). The data recalled from the latter are stochastic variables but the fidelity of recall is shown to have a deterministic limit if the number of memory elements grows without limits. A special case of correlation matrix memories is the auto-associative memory in which any part of the memorized information can be used as a key. The memories are selective with respect to accumulated data. The ICMM exhibits adaptive improvement under certain circumstances. It is also suggested that correlation matrix memories could be applied for the classification of data.}, author = {Kohonen, Teuvo}, doi = {10.1109/TC.1972.5008975}, issn = {0018-9340}, journal = {Computers, IEEE Transactions on}, keywords = {Associative memory,Pattern Recognition,associative net,associative recall,correlation matrix memory,mscthesis,nonholographic associative memory}, mendeley-tags = {Associative memory,Pattern Recognition,associative net,associative recall,correlation matrix memory,mscthesis,nonholographic associative memory}, month = apr, number = {4}, pages = {353--359}, title = {{Correlation matrix memories}}, url = {http://dx.doi.org/10.1109/TC.1972.5008975 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=5008975}, volume = {21}, year = {1972} }  @article{Lange1997, author = {Lange, Nicholas and Bishop, C. M. and Ripley, B. 
D.}, doi = {10.2307/2965437}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Lange, Bishop, Ripley - 1997 - Neural Networks for Pattern Recognition.pdf:pdf}, issn = {01621459}, journal = {Journal of the American Statistical Association}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = dec, number = {440}, pages = {1642}, title = {{Neural Networks for Pattern Recognition.}}, url = {http://www.jstor.org/stable/2965437?origin=crossref}, volume = {92}, year = {1997} }  @article{LeCun1995, abstract = {INTRODUCTION The ability of multilayer back-propagation networks to learn complex, high-dimensional, nonlinear mappings from large collections of examples makes them obvious candidates for image recognition or speech recognition tasks (see PATTERN RECOGNITION AND NEURAL NETWORKS). In the traditional model of pattern recognition, a hand-designed feature extractor gathers relevant information from the input and eliminates irrelevant variabilities. A trainable classifier then categorizes the resulting feature vectors (or strings of symbols) into classes. In this scheme, standard, fully-connected multilayer networks can be used as classifiers. A potentially more interesting scheme is to eliminate the feature extractor, feeding the network with "raw" inputs (e.g. normalized images), and to rely on backpropagation to turn the first few layers into an appropriate feature extractor. 
While this can be done with an ordinary fully connected feed-forward network with some success for tasks}, author = {LeCun, Y and Bengio, Y}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/LeCun, Bengio - 1995 - Convolutional networks for images, speech, and time series.pdf:pdf}, journal = {The Handbook of Brain Theory and Neural Networks}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, pages = {1--14}, title = {{Convolutional networks for images, speech, and time series}}, url = {http://www.iro.umontreal.ca/labs/neuro/pointeurs/handbook-convo.pdf}, year = {1995} }  @article{LeCun1990, author = {LeCun, Yann and Boser, B. and Denker, JS and Henderson, D. and Howard, R.E. and Hubbard, W. and Jackel, L.D.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/LeCun et al. - 1990 - Handwritten digit recognition with a back-propagation network.pdf:pdf}, journal = {Advances in Neural Information Processing Systems}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {396--404}, title = {{Handwritten digit recognition with a back-propagation network}}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.32.5076}, year = {1990} }  @article{Leeuw2009, author = {Leeuw, J}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Leeuw - 2009 - Journal of Statistical Software.pdf:pdf}, journal = {Wiley Interdisciplinary Reviews: Computational Statistics}, keywords = {mscthesis,r,support vector machines}, mendeley-tags = {mscthesis}, number = {9}, title = {{Journal of Statistical Software}}, url = {http://onlinelibrary.wiley.com/doi/10.1002/wics.10/full}, volume = {15}, year = {2009} }  @article{Linsker1988, abstract = {The emergence of a feature-analyzing function from the development rules of simple, multilayered networks is explored. 
It is shown that even a single developing cell of a layered network exhibits a remarkable set of optimization properties that are closely related to issues in statistics, theoretical physics, adaptive signal processing, the formation of knowledge representation in artificial intelligence, and information theory. The network studied is based on the visual system. These results are used to infer an information-theoretic principle that can be applied to the network as a whole, rather than a single cell. The organizing principle proposed is that the network connections develop in such a way as to maximize the amount of information that is preserved when signals are transformed at each processing stage, subject to certain constraints. The operation of this principle is illustrated for some simple cases.}, author = {Linsker, Ralph}, doi = {10.1109/2.36}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/ERX5DMAM/Linsker - 1988 - Self-organisation in a perceptual network.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/65QFWFED/summary.html:html}, issn = {0018-9162}, journal = {Computer}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = mar, number = {3}, pages = {105--117}, title = {{Self-organisation in a perceptual network}}, url = {http://dx.doi.org/10.1109/2.36}, volume = {21}, year = {1988} }  @article{McCulloch1943, abstract = {Because of the “all-or-none” character of nervous activity, neural events and the relations among them can be treated by means of propositional logic. It is found that the behavior of every net can be described in these terms, with the addition of more complicated logical means for nets containing circles; and that for any logical expression satisfying certain conditions, one can find a net behaving in the fashion it describes. 
It is shown that many particular choices among possible neurophysiological assumptions are equivalent, in the sense that for every net behaving under one assumption, there exists another net which behaves under the other and gives the same results, although perhaps not in the same time. Various applications of the calculus are discussed.}, author = {McCulloch, WS and Pitts, Walter}, doi = {10.1007/BF02478259}, file = {:share/imagedb/perellm1/references/McCulloch, Pitts\_1943\_A logical calculus of the ideas immanent in nervous activity.pdf:pdf;:share/imagedb/perellm1/references/McCulloch, Pitts\_1943\_A logical calculus of the ideas immanent in nervous activity(2).pdf:pdf}, issn = {0007-4985, 1522-9602}, journal = {The bulletin of mathematical biophysics}, keywords = {Mathematical Biology in General,mscthesis}, language = {en}, mendeley-tags = {Mathematical Biology in General,mscthesis}, month = dec, number = {4}, pages = {115--133}, title = {{A logical calculus of the ideas immanent in nervous activity}}, url = {http://link.springer.com/article/10.1007/BF02478259}, volume = {5}, year = {1943} }  @article{Nachar2008, author = {Nachar, Nadim}, file = {:share/imagedb/perellm1/references/Nachar\_2008\_The Mann-Whitney U a test for assessing whether two independent samples come from the same distribution.pdf:pdf}, journal = {Tutorials in Quantitative Methods for Psychology}, number = {1}, pages = {13--20}, title = {{The Mann-Whitney U: a test for assessing whether two independent samples come from the same distribution}}, url = {http://mail.tqmp.org/Content/vol04-1/p013/p013.pdf}, volume = {4}, year = {2008} }  @article{Nakashika, author = {Nakashika, Toru and Garcia, Christophe and Takiguchi, Tetsuya and Lyon, Insa De}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Nakashika et al. 
- Unknown - Local-feature-map Integration Using Convolutional Neural Networks for Music Genre Classification.pdf:pdf}, journal = {INTERSPEECH}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--4}, title = {{Local-feature-map Integration Using Convolutional Neural Networks for Music Genre Classification}}, year = {2012} }  @article{Neumann1966, author = {von Neumann, John and Burks, Arthur W.}, file = {:share/imagedb/perellm1/references/Neumann, Burks\_1966\_Theory of self-reproducing automata.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Theory of self-reproducing automata}}, url = {http://dl.acm.org/citation.cfm?id=1102024}, year = {1966} }  @article{Pascanu2014, abstract = {A central challenge to many fields of science and engineering involves minimizing non-convex error functions over continuous, high dimensional spaces. Gradient descent or quasi-Newton methods are almost ubiquitously used to perform such minimizations, and it is often thought that a main source of difficulty for the ability of these local methods to find the global minimum is the proliferation of local minima with much higher error than the global minimum. Here we argue, based on results from statistical physics, random matrix theory, and neural network theory, that a deeper and more profound difficulty originates from the proliferation of saddle points, not local minima, especially in high dimensional problems of practical interest. Such saddle points are surrounded by high error plateaus that can dramatically slow down learning, and give the illusory impression of the existence of a local minimum. Motivated by these arguments, we propose a new algorithm, the saddle-free Newton method, that can rapidly escape high dimensional saddle points, unlike gradient descent and quasi-Newton methods. 
We apply this algorithm to deep neural network training, and provide preliminary numerical evidence for its superior performance.}, archiveprefix = {arXiv}, arxivid = {arXiv:1405.4604v1}, author = {Pascanu, Razvan and Dauphin, YN}, eprint = {arXiv:1405.4604v1}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Pascanu, Dauphin - 2014 - On the saddle point problem for non-convex optimization.pdf:pdf}, journal = {arXiv preprint arXiv:1405.4604}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--11}, title = {{On the saddle point problem for non-convex optimization}}, url = {http://arxiv.org/abs/1405.4604}, year = {2014} }  @article{Ranzato2007, abstract = {We present an unsupervised method for learning a hierarchy of sparse feature detectors that are invariant to small shifts and distortions. The resulting feature extractor consists of multiple convolution filters, followed by a feature-pooling layer that computes the max of each filter output within adjacent windows, and a point-wise sigmoid non-linearity. A second level of larger and more invariant features is obtained by training the same algorithm on patches of features from the first level. Training a supervised classifier on these features yields 0.64\% error on MNIST, and 54\% average recognition rate on Caltech 101 with 30 training samples per category. While the resulting architecture is similar to convolutional networks, the layer-wise unsupervised training procedure alleviates the over-parameterization problems that plague purely supervised learning procedures, and yields good performance with very few labeled training samples.}, author = {Ranzato, Marc'Aurelio and Huang, Fu Jie and Boureau, Y-Lan and LeCun, Yann}, doi = {10.1109/CVPR.2007.383157}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Ranzato et al. 
- 2007 - Unsupervised Learning of Invariant Feature Hierarchies with Applications to Object Recognition.pdf:pdf}, isbn = {1-4244-1179-3}, journal = {2007 IEEE Conference on Computer Vision and Pattern Recognition}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = jun, pages = {1--8}, publisher = {Ieee}, title = {{Unsupervised Learning of Invariant Feature Hierarchies with Applications to Object Recognition}}, url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=4270182}, year = {2007} }  @book{Ripley1996, abstract = {This 1996 book is a reliable account of the statistical framework for pattern recognition and machine learning. With unparalleled coverage and a wealth of case-studies this book gives valuable insight into both the theory and the enormously diverse applications (which can be found in remote sensing, astrophysics, engineering and medicine, for example). So that readers can develop their skills and understanding, many of the real data sets used in the book are available from the author's website: www.stats.ox.ac.uk/\~{}ripley/PRbook/. For the same reason, many examples are included to illustrate real problems in pattern recognition. Unifying principles are highlighted, and the author gives an overview of the state of the subject, making the book valuable to experienced researchers in statistics, machine learning/artificial intelligence and engineering. 
The clear writing style means that the book is also a superb introduction for non-specialists.}, author = {Ripley, Brian D.}, isbn = {9780521460866}, keywords = {Mathematics / Probability \& Statistics / General,mscthesis}, language = {en}, mendeley-tags = {Mathematics / Probability \& Statistics / General,mscthesis}, pages = {422}, publisher = {Cambridge University Press}, title = {{Pattern Recognition and Neural Networks}}, url = {http://books.google.fi/books?id=2SzT2p8vP1oC}, year = {1996} }  @techreport{Rosenblatt1957, address = {Buffalo, NY}, author = {Rosenblatt, Frank}, institution = {Cornell Aeronautical Laboratory}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{The Perceptron, a Perceiving and Recognizing Automaton}}, year = {1957} }  @article{Russakovsky2014, abstract = {The ImageNet Large Scale Visual Recognition Challenge is a benchmark in object category classification and detection on hundreds of object categories and millions of images. The challenge has been run annually from 2010 to present, attracting participation from more than fifty institutions. This paper describes the creation of this benchmark dataset and the advances in object recognition that have been possible as a result. We discuss the challenges of collecting large-scale ground truth annotation, highlight key breakthroughs in categorical object recognition, provide detailed a analysis of the current state of the field of large-scale image classification and object detection, and compare the state-of-the-art computer vision accuracy with human accuracy. 
We conclude with lessons learned in the five years of the challenge, and propose future directions and improvements.}, annote = {Comment: 37 pages, 14 figures}, author = {Russakovsky, Olga and Deng, Jia and Su, Hao}, file = {:share/imagedb/perellm1/references/Russakovsky, Deng, Su\_2014\_Imagenet large scale visual recognition challenge.pdf:pdf}, journal = {arXiv preprint arXiv:1409.0575}, keywords = {Computer Science - Computer Vision and Pattern Rec,I.4.8,I.5.2,mscthesis}, mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,I.4.8,I.5.2,mscthesis}, month = sep, title = {{ImageNet large scale visual recognition challenge}}, url = {http://arxiv.org/abs/1409.0575 http://www.arxiv.org/pdf/1409.0575.pdf}, year = {2014} }  @book{Schmidhuber2014, abstract = {In recent years, deep artificial neural networks (including recurrent ones) have won numerous contests in pattern recognition and machine learning. This historical survey compactly summarises relevant work, much of it from the previous millennium. Shallow and deep learners are distinguished by the depth of their credit assignment paths, which are chains of possibly learnable, causal links between actions and effects. 
I review deep supervised learning (also recapitulating the history of backpropagation), unsupervised learning, reinforcement learning$\backslash$\& evolutionary computation, and indirect search for short programs encoding deep and large networks.}, address = {Manno-Lugano}, archiveprefix = {arXiv}, arxivid = {arXiv:1404.7828v1}, author = {Schmidhuber, J\"{u}rgen}, booktitle = {arXiv preprint arXiv:1404.7828}, eprint = {arXiv:1404.7828v1}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Schmidhuber - 2014 - Deep Learning in Neural Networks An Overview.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--66}, title = {{Deep Learning in Neural Networks: An Overview}}, url = {http://arxiv.org/abs/1404.7828}, year = {2014} }  @article{Sejnowski1987, author = {Sejnowski, TJ and Rosenberg, CR}, file = {:share/imagedb/perellm1/references/Sejnowski, Rosenberg\_1987\_Parallel networks that learn to pronounce English text.pdf:pdf}, journal = {Complex systems}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {145--168}, title = {{Parallel networks that learn to pronounce English text}}, url = {http://cs.union.edu/~rieffelj/classes/2011-12/csc320/readings/Sejnowski-speech-1987.pdf}, volume = {1}, year = {1987} }  @article{Serre2007, abstract = {We introduce a new general framework for the recognition of complex visual scenes, which is motivated by biology: We describe a hierarchical system that closely follows the organization of visual cortex and builds an increasingly complex and invariant feature representation by alternating between a template matching and a maximum pooling operation. We demonstrate the strength of the approach on a range of recognition tasks: From invariant single object recognition in clutter to multiclass categorization problems and complex scene understanding tasks that rely on the recognition of both shape-based as well as texture-based objects. 
Given the biological constraints that the system had to satisfy, the approach performs surprisingly well: It has the capability of learning from only a few training examples and competes with state-of-the-art systems. We also discuss the existence of a universal, redundant dictionary of features that could handle the recognition of most object categories. In addition to its relevance for computer vision, the success of this approach suggests a plausibility proof for a class of feedforward models of object recognition in cortex.}, author = {Serre, Thomas and Wolf, Lior and Bileschi, Stanley and Riesenhuber, Maximilian and Poggio, Tomaso}, doi = {10.1109/TPAMI.2007.56}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Serre et al. - 2007 - Robust object recognition with cortex-like mechanisms.pdf:pdf}, issn = {0162-8828}, journal = {IEEE transactions on pattern analysis and machine intelligence}, keywords = {Algorithms,Artificial Intelligence,Biomimetics,Biomimetics: methods,Computer Simulation,Humans,Image Enhancement,Image Enhancement: methods,Image Interpretation, Computer-Assisted,Image Interpretation, Computer-Assisted: methods,Models, Biological,Pattern Recognition, Automated,Pattern Recognition, Automated: methods,Pattern Recognition, Visual,Pattern Recognition, Visual: physiology,Reproducibility of Results,Sensitivity and Specificity,Visual Cortex,Visual Cortex: physiology,mscthesis,visual cortex}, mendeley-tags = {mscthesis,visual cortex}, month = mar, number = {3}, pages = {411--26}, pmid = {17224612}, title = {{Robust object recognition with cortex-like mechanisms.}}, url = {http://www.ncbi.nlm.nih.gov/pubmed/17224612}, volume = {29}, year = {2007} }  @article{Simonyan2013, abstract = {This paper addresses the visualisation of image classification models, learnt using deep Convolutional Networks (ConvNets). 
We consider two visualisation techniques, based on computing the gradient of the class score with respect to the input image. The first one generates an image, which maximises the class score [Erhan et al., 2009], thus visualising the notion of the class, captured by a ConvNet. The second technique computes a class saliency map, specific to a given image and class. We show that such maps can be employed for weakly supervised object segmentation using classification ConvNets. Finally, we establish the connection between the gradient-based ConvNet visualisation methods and deconvolutional networks [Zeiler et al., 2013].}, archiveprefix = {arXiv}, arxivid = {arXiv:1312.6034v1}, author = {Simonyan, Karen and Vedaldi, A and Zisserman, A}, eprint = {arXiv:1312.6034v1}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Simonyan, Vedaldi, Zisserman - 2013 - Deep Inside Convolutional Networks Visualising Image Classification Models and Saliency Maps.pdf:pdf}, journal = {arXiv preprint arXiv:1312.6034}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, pages = {1--8}, title = {{Deep Inside Convolutional Networks: Visualising Image Classification Models and Saliency Maps}}, url = {http://arxiv.org/abs/1312.6034}, year = {2013} }  @article{Srebro2005, abstract = {We study the rank, trace-norm and max-norm as complexity measures of matrices, focusing on the problem of fitting a matrix with matrices having low complexity. We present generalization error bounds for predicting unobserved entries that are based on these measures. We also consider the possible relations between these measures. 
We show gaps between them, and bounds on the extent of such gaps.}, author = {Srebro, Nathan and Shraibman, Adi}, file = {:share/imagedb/perellm1/references/Srebro, Shraibman\_2005\_Rank, trace-norm and max-norm.pdf:pdf}, journal = {Learning Theory}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {545--560}, title = {{Rank, trace-norm and max-norm}}, url = {http://link.springer.com/chapter/10.1007/11503415\_37}, year = {2005} }  @incollection{Stollenga2014, abstract = {Traditional convolutional neural networks (CNN) are stationary and feedforward. They neither change their parameters during evaluation nor use feedback from higher to lower layers. Real brains, however, do. So does our Deep Attention Selective Network (dasNet) architecture. DasNet's feedback structure can dynamically alter its convolutional filter sensitivities during classification. It harnesses the power of sequential processing to improve classification performance, by allowing the network to iteratively focus its internal attention on some of its convolutional filters. Feedback is trained through direct policy search in a huge million-dimensional parameter space, through scalable natural evolution strategies (SNES). On the CIFAR-10 and CIFAR-100 datasets, dasNet outperforms the previous state-of-the-art model on unaugmented datasets.}, author = {Stollenga, Marijn F and Masci, Jonathan and Gomez, Faustino and Schmidhuber, J\"{u}rgen}, editor = {Ghahramani, Z. and Welling, M. and Cortes, C. and Lawrence, N. D. and Weinberger, K. Q.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/P6VKFETS/Stollenga et al. 
- 2014 - Deep Networks with Internal Selective Attention th.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/MSIG6DZH/5276-deep-networks-with-internal-selective-attention-through-feedback-connections.html:html}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {3545--3553}, publisher = {Curran Associates, Inc.}, title = {{Deep Networks with Internal Selective Attention through Feedback Connections}}, url = {http://papers.nips.cc/paper/5276-deep-networks-with-internal-selective-attention-through-feedback-connections.pdf http://papers.nips.cc/paper/5276-deep-networks-with-internal-selective-attention-through-feedback-connections}, year = {2014} }  @article{Sun2012, abstract = {In this paper, we describe the TRECVid 2012 videoconcept detection system first developed at the NTTMedia Intelligence Laboratories in collaborationwith Dalian University of Technology. For thisyear’s task, we adopted a subspace partition basedscheme for classifier learning, with emphasis on thereduction of classifier complexity, aiming atimproving the training efficiency and boosting theclassifier performance. As the video corpus used forTRECVid evaluation is ever increasing, two practicalissues are becoming more and more challenging forbuilding concept detection systems. The first one isthe time-consuming training and testing procedures,which have taken up most of the evaluation activities,preventing the design and testing of novel algorithms.The second and the more important issue is thatwhen using whole data for classifier training, thederived separating hyperplanes would be rathercomplex and thus degrade the classificationperformance. To address these issues, we propose toadopt the “divide-and-conquer” strategy for conceptdetector construction as follows. We first partitionthe whole training feature space into multiplesub-space with a scalable clustering method, andthen build sub-classifiers on these sub-spacesseparately for each concept. 
The decision of a testing sample is the fusion of the results of a few fired sub-classifiers. Experimental results demonstrate the efficiency and effectiveness of our proposed approach.}, author = {Sun, Yongqing and Sudo, Kyoko and Taniguchi, Yukinobu and Li, Haojie}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Sun et al. - 2012 - TRECVid 2012 Semantic Video Concept Detection by NTT-MD-DUT.pdf:pdf}, journal = {Proceedings of TRECVID 2012}, keywords = {2012 video,concept detection system first,developed at the ntt,haojie li,in this paper,lei yi,mscthesis,we describe the trecvid,yue guan}, mendeley-tags = {mscthesis}, title = {{TRECVid 2012 Semantic Video Concept Detection by NTT-MD-DUT}}, url = {http://www-nlpir.nist.gov/projects/tvpubs/tv12.papers/ntt.pdf}, year = {2012} }  @article{Sutskever2011, author = {Sutskever, Ilya and Martens, James and Hinton, Geoffrey}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Sutskever, Martens, Hinton - 2011 - Generating text with recurrent neural networks.pdf:pdf}, journal = {Proceedings of the 28th International Conference on Machine Learning (ICML)}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Generating text with recurrent neural networks}}, url = {http://machinelearning.wustl.edu/mlpapers/paper\_files/ICML2011Sutskever\_524.pdf}, year = {2011} }  @book{Barto1998, abstract = {Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives when interacting with a complex, uncertain environment. In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the key ideas and algorithms of reinforcement learning. Their discussion ranges from the history of the field's intellectual foundations to the most recent developments and applications. 
The only necessary mathematical background is familiarity with elementary concepts ofprobability.The book is divided into three parts. Part I defines thereinforcement learning problem in terms of Markov decision processes. Part II provides basicsolution methods: dynamic programming, Monte Carlo methods, and temporal-difference learning. PartIII presents a unified view of the solution methods and incorporates artificial neural networks,eligibility traces, and planning; the two final chapters present case studies and consider thefuture of reinforcement learning.}, author = {Sutton, Richard S. and Barto, Andrew G.}, file = {:share/imagedb/perellm1/references//Sutton, Barto\_1998\_Reinforcement Learning An Introduction.pdf:pdf}, isbn = {9780262193986}, keywords = {Computers / Intelligence (AI) \& Semantics,mscthesis}, language = {en}, mendeley-tags = {Computers / Intelligence (AI) \& Semantics,mscthesis}, pages = {356}, publisher = {MIT Press}, shorttitle = {Reinforcement Learning}, title = {{Reinforcement Learning: An Introduction}}, url = {http://books.google.fi/books?id=CAFR6IBF4xYC}, year = {1998} }  @article{Taylor1956, author = {Taylor, Wilfrid K.}, journal = {Information theory 3}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {314--328}, title = {{Electrical simulation of some nervous system functional activities.}}, year = {1956} }  @article{Wang2011, abstract = {Feature trajectories have shown to be efficient for representing videos. Typically, they are extracted using the KLT tracker or matching SIFT descriptors between frames. However, the quality as well as quantity of these trajectories is often not sufficient. Inspired by the recent success of dense sampling in image classification, we propose an approach to describe videos by dense trajectories. We sample dense points from each frame and track them based on displacement information from a dense optical flow field. 
Given a state-of-the-art optical flow algorithm, our trajectories are robust to fast irregular motions as well as shot boundaries. Additionally, dense trajectories cover the motion information in videos well. We, also, investigate how to design descriptors to encode the trajectory information. We introduce a novel descriptor based on motion boundary histograms, which is robust to camera motion. This descriptor consistently outperforms other state-of-the-art descriptors, in particular in uncontrolled realistic videos. We evaluate our video description in the context of action classification with a bag-of-features approach. Experimental results show a significant improvement over the state of the art on four datasets of varying difficulty, i.e. KTH, YouTube, Hollywood2 and UCF sports.}, author = {Wang, Heng and Kl{\"a}ser, Alexander and Schmid, Cordelia and Liu, Cheng-Lin}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Wang et al. - 2011 - Action recognition by dense trajectories.pdf:pdf}, journal = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Action recognition by dense trajectories}}, url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=5995407}, year = {2011} }  @book{Wiener1948, abstract = {"It appears impossible for anyone seriously interested in our civilization to ignore this book. It is a ‘must’ book for those in every branch of science... in addition, economists, politicians, statesmen, and businessmen cannot afford to overlook cybernetics and its tremendous, even terrifying implications. "It is a beautifully written book, lucid, direct, and despite its complexity, as readable by the layman as the trained scientist." -- John B. 
Thurston}, author = {Wiener, Norbert}, file = {:share/imagedb/perellm1/references/Wiener\_1948\_Cybernetics or Control and Communication in the Animal and the Machine.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, publisher = {The Massachusetts Institute of Technology}, title = {{Cybernetics or Control and Communication in the Animal and the Machine}}, url = {http://books.google.com/books?hl=en\&lr=\&id=NnM-uISyywAC\&oi=fnd\&pg=PR7\&dq=Cybernetics+or+control+and+communication+in+the+animal+and+the+machine\&ots=xgLtfEinIp\&sig=9Piaa8rdfGhy6sixVT4CdtYkYu0}, year = {1948} }  @article{Willshaw1969, abstract = {The features of a hologram that commend it as a model of associative memory can be improved on by other devices.}, author = {Willshaw, D. J. and Buneman, O. P. and Longuet-Higgins, H. C.}, doi = {10.1038/222960a0}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/AEFI28Q8/Willshaw et al. - 1969 - Non-Holographic Associative Memory.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/ZX9B7IRH/222960a0.html:html}, journal = {Nature}, keywords = {mscthesis}, language = {en}, mendeley-tags = {mscthesis}, month = jun, number = {5197}, pages = {960--962}, title = {{Non-Holographic Associative Memory}}, url = {http://www.nature.com/nature/journal/v222/n5197/abs/222960a0.html}, volume = {222}, year = {1969} }  @article{Yang2013, abstract = {We propose a novel approach to boost the performance of generic object detectors on videos by learning video-specific features using a deep neural network. 
The insight behind our proposed approach is that an object appearing in different frames of a video clip should share similar features, which can be learned to build better detectors. Unlike many supervised detector adaptation or detection-by-tracking methods, our method does not require any extra annotations or utilize temporal correspondence. We start with the high-confidence detections from a generic detector, then iteratively learn new video-specific features and refine the detection scores. In order to learn discriminative and compact features, we propose a new feature learning method using a deep neural network based on auto en-coders. It differs from the existing unsupervised feature learning methods in two ways: first it optimizes both discriminative and generative properties of the features simultaneously, which gives our features better discriminative ability, second, our learned features are more compact, while the unsupervised feature learning methods usually learn a redundant set of over-complete features. 
Extensive experimental results on person and horse detection show that significant performance improvement can be achieved with our proposed method.}, author = {Yang, Yang and Shu, Guang and Shah, Mubarak}, doi = {10.1109/CVPR.2013.216}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Yang, Shu, Shah - 2013 - Semi-supervised Learning of Feature Hierarchies for Object Detection in a Video.pdf:pdf}, isbn = {978-0-7695-4989-7}, journal = {2013 IEEE Conference on Computer Vision and Pattern Recognition}, keywords = {mscthesis,trecvid}, mendeley-tags = {mscthesis,trecvid}, month = jun, pages = {1650--1657}, publisher = {Ieee}, title = {{Semi-supervised Learning of Feature Hierarchies for Object Detection in a Video}}, url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=6619060 http://crcv.ucf.edu/papers/cvpr2013/CVPR2013\_Yang\_FinalVersion\_HumanDetection.pdf}, year = {2013} }  @article{Fisher2012, abstract = {We present a method for synthesizing 3D object arrangements from examples. Given a few user-provided examples, our system can synthesize a diverse set of plausible new scenes by learning from a larger scene database. We rely on three novel contributions. First, we introduce a probabilistic model for scenes based on Bayesian networks and Gaussian mixtures that can be trained from a small number of input examples. Second, we develop a clustering algorithm that groups objects occurring in a database of scenes according to their local scene neighborhoods. These contextual categories allow the synthesis process to treat a wider variety of objects as interchangeable. Third, we train our probabilistic model on a mix of user-provided examples and relevant scenes retrieved from the database. This mixed model learning process can be controlled to introduce additional variety into the synthesized scenes. 
We evaluate our algorithm through qualitative results and a perceptual study in which participants judged synthesized scenes to be highly plausible, as compared to hand-created scenes.}, author = {Fisher, Matthew and Ritchie, Daniel and Savva, Manolis and Funkhouser, Thomas and Hanrahan, Pat}, doi = {10.1145/2366145.2366154}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/MC4WUB8K/Fisher et al. - 2012 - Example-based Synthesis of 3D Object Arrangements.pdf:pdf}, issn = {0730-0301}, journal = {ACM Trans. Graph.}, keywords = {3D scenes,automatic layout,data-driven methods,probabilistic modeling,procedural modeling}, mendeley-tags = {3D scenes,automatic layout,data-driven methods,probabilistic modeling,procedural modeling}, month = nov, number = {6}, pages = {135:1--135:11}, title = {{Example-based Synthesis of 3D Object Arrangements}}, url = {http://doi.acm.org/10.1145/2366145.2366154 http://dl.acm.org/ft\_gateway.cfm?id=2366154\&type=pdf}, volume = {31}, year = {2012} }  @article{Kalogerakis2012, abstract = {We present an approach to synthesizing shapes from complex domains, by identifying new plausible combinations of components from existing shapes. Our primary contribution is a new generative model of component-based shape structure. The model represents probabilistic relationships between properties of shape components, and relates them to learned underlying causes of structural variability within the domain. These causes are treated as latent variables, leading to a compact representation that can be effectively learned without supervision from a set of compatibly segmented shapes. 
We evaluate the model on a number of shape datasets with complex structural variability and demonstrate its application to amplification of shape databases and to interactive shape synthesis.}, author = {Kalogerakis, Evangelos and Chaudhuri, Siddhartha and Koller, Daphne and Koltun, Vladlen}, doi = {10.1145/2185520.2185551}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WBN8PIHV/Kalogerakis et al. - 2012 - A Probabilistic Model for Component-based Shape Sy.pdf:pdf}, issn = {0730-0301}, journal = {ACM Trans. Graph.}, keywords = {data-driven 3D modeling,machine learning,probabilistic graphical models,shape structure,shape synthesis}, mendeley-tags = {data-driven 3D modeling,machine learning,probabilistic graphical models,shape structure,shape synthesis}, month = jul, number = {4}, pages = {55:1--55:11}, title = {{A Probabilistic Model for Component-based Shape Synthesis}}, url = {http://doi.acm.org/10.1145/2185520.2185551 http://dl.acm.org/ft\_gateway.cfm?id=2185551\&type=pdf}, volume = {31}, year = {2012} }  @book{Bain1873, author = {Bain, Alexander}, file = {:share/imagedb/perellm1/references/Bain\_1873\_Mind and body. The theories of their relation.pdf:pdf}, keywords = {Psychophysiology,mscthesis}, language = {eng}, mendeley-tags = {Psychophysiology,mscthesis}, pages = {220}, publisher = {New York : D. Appleton and company}, title = {{Mind and body. The theories of their relation}}, url = {http://archive.org/details/mindbodytheories00bain}, year = {1873} }  @book{Boden2006, abstract = {The development of cognitive science is one of the most remarkable and fascinating intellectual achievements of the modern era. The quest to understand the mind is as old as recorded human thought; but the progress of modern science has offered new methods and techniques which have revolutionized this enquiry. Oxford University Press now presents a masterful history of cognitive science, told by one of its most eminent practitioners. 
Cognitive science is the project of understanding the mind by modeling its workings. Psychology is its heart, but it draws together various adjoining fields of research, including artificial intelligence; neuroscientific study of the brain; philosophical investigation of mind, language, logic, and understanding; computational work on logic and reasoning; linguistic research on grammar, semantics, and communication; and anthropological explorations of human similarities and differences. Each discipline, in its own way, asks what the mind is, what it does, how it works, how it developed - how it is even possible. The key distinguishing characteristic of cognitive science, Boden suggests, compared with older ways of thinking about the mind, is the notion of understanding the mind as a kind of machine. She traces the origins of cognitive science back to Descartes's revolutionary ideas, and follows the story through the eighteenth and nineteenth centuries, when the pioneers of psychology and computing appear. Then she guides the reader through the complex interlinked paths along which the study of the mind developed in the twentieth century. Cognitive science, in Boden's broad conception, covers a wide range of aspects of mind: not just 'cognition' in the sense of knowledge or reasoning, but emotion, personality, social communication, and even action. In each area of investigation, Boden introduces the key ideas and the people who developed them. No one else could tell this story as Boden can: she has been an active participant in cognitive science since the 1960s, and has known many of the key figures personally. Her narrative is written in a lively, swift-moving style, enriched by the personal touch of someone who knows the story at first hand. Her history looks forward as well as back: it is her conviction that cognitive science today--and tomorrow--cannot be properly understood without a historical perspective. 
Mind as Machine will be a rich resource for anyone working on the mind, in any academic discipline, who wants to know how our understanding of our mental activities and capacities has developed.}, author = {Boden, Margaret A.}, file = {:share/imagedb/perellm1/references/Boden\_2006\_Mind as machine A history of cognitive science.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Mind as machine: A history of cognitive science}}, url = {http://books.google.com/books?hl=en\&lr=\&id=yRyETy43AdQC\&oi=fnd\&pg=PR13\&dq=Mind+as+Machine+:+A+History+of+Cognitive+Science\&ots=YM7F77T\_wn\&sig=FfJrfq5CddLrSEXhHUDwKpOMvi0}, volume = {1}, year = {2006} }  @article{Copeland1999, author = {Copeland, B. Jack and Proudfoot, Diane}, file = {:share/imagedb/perellm1/references/Copeland, Proudfoot\_1999\_Alan Turing's forgotten ideas in Computer Science.pdf:pdf}, journal = {Scientific American}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {99--103}, title = {{Alan Turing's forgotten ideas in Computer Science}}, url = {http://www.citeulike.org/group/1662/article/1505566}, year = {1999} }  @article{Dieleman2011, abstract = {Recently the ‘Million Song Dataset’, containing audio features and metadata for one million songs, was made available. In this paper, we build a convolutional network that is then trained to perform artist recognition, genre recognition and key detection. The network is tailored to summarize the audio features over musically significant timescales. It is infeasible to train the network on all available data in a supervised fashion, so we use unsupervised pretraining to be able to harness the entire dataset: we train a convolutional deep belief network on all data, and then use the learnt parameters to initialize a convolutional multilayer perceptron with the same architecture. The MLP is then trained on a labeled subset of the data for each task. We also train the same MLP with randomly initialized weights. 
We find that our convolutional approach improves accuracy for the genre recognition and artist recognition tasks. Unsupervised pretraining improves convergence speed in all cases. For artist recognition it improves accuracy as well.}, author = {Dieleman, Sander and Brakel, P and Schrauwen, Benjamin}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Dieleman, Brakel, Schrauwen - 2011 - Audio-based music classification with a pretrained convolutional network.pdf:pdf}, journal = {\ldots International Society for Music \ldots}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, number = {Ismir}, pages = {669--674}, title = {{Audio-based music classification with a pretrained convolutional network}}, url = {https://biblio.ugent.be/publication/1989534}, year = {2011} }  @article{Fan2014, abstract = {Face representation is a crucial step of face recognition systems. An optimal face representation should be discriminative, robust, compact, and very easy-to-implement. While numerous hand-crafted and learning-based representations have been proposed, considerable room for improvement is still present. In this paper, we present a very easy-to-implement deep learning framework for face representation. Our method bases on a new structure of deep network (called Pyramid CNN). The proposed Pyramid CNN adopts a greedy-filter-and-down-sample operation, which enables the training procedure to be very fast and computation-efficient. In addition, the structure of Pyramid CNN can naturally incorporate feature sharing across multi-scale face representations, increasing the discriminative ability of resulting representation. Our basic network is capable of achieving high recognition accuracy (85.8\% on LFW benchmark) with only 8 dimension representation. When extended to feature-sharing Pyramid CNN, our system achieves the state-of-the-art performance (97.3\%) on LFW benchmark. 
We also introduce a new benchmark of realistic face images on social network and validate our proposed representation has a good ability of generalization.}, annote = {$\backslash$begin\{itemize\}$\backslash$item New deep structure Pyramid CNN$\backslash$item Labeled Faces in the Wild (LFW)$\backslash$begin\{itemize\}$\backslash$item \$> 13.000\$faces$\backslash$item 1680 of the people have two or more distinct photos$\backslash$item Detected by Viola-Jones detector$\backslash$item http://vis-www.cs.umass.edu/lfw/$\backslash$end\{itemize\}$\backslash$item State-of-the-art performance on LFW benchmark (\$97.3\backslash\%\$)$\backslash$item Good face representation$\backslash$begin\{itemize\}$\backslash$item Identity-preserving: Same person pictures close in feature space$\backslash$item Abstract and Compact: from high to low dimensionality$\backslash$item Uniform and Automatic: NO hand-crafted and hard-wired parts$\backslash$end\{itemize\}$\backslash$item Pyramid CNN$\backslash$begin\{itemize\}$\backslash$item ID-preserving Representation Learning: Loss functions measures distance in output feature space$\backslash$item Convolutions and Down-sampling$\backslash$item Deeper give best results, but increases rapidly the training time$\backslash$item Each CNN own private output layer and gets the input from the previous shared layer$\backslash$item Only the output of the last level network is used for the represetnation$\backslash$item The rest of the outputs is just for training$\backslash$end\{itemize\}$\backslash$item Results$\backslash$begin\{itemize\}$\backslash$item 164 incorrect predictions$\backslash$item Some of them are incorrectly labeled$\backslash$item Others are very difficult for humans, because of the age or pose$\backslash$item On LFW benchmark achieves state-of-the-art and close to human on croped images$\backslash$end\{itemize\}$\backslash$item With ROC curve as a mesure there is an improvement of 0.07-0.12 with Baseline$\backslash$item Face 
recognition does not contemplate affine transformations or perspectives,$\backslash$item Can be difficult to apply in task such as ImageNet, where the object can be in any place and position$\backslash$end\{itemize\}}, archiveprefix = {arXiv}, arxivid = {arXiv:1403.2802v1}, author = {Fan, Haoqiang and Cao, Zhimin and Jiang, Yuning and Yin, Qi and Doudou, Chinchilla}, eprint = {arXiv:1403.2802v1}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Fan et al. - 2014 - Learning Deep Face Representation.pdf:pdf}, journal = {arXiv preprint arXiv:1403.2802}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--10}, title = {{Learning Deep Face Representation}}, url = {http://arxiv.org/abs/1403.2802}, year = {2014} }  @book{Fu2003, author = {Fu, LiMin}, isbn = {9780070532823}, keywords = {mscthesis}, language = {en}, mendeley-tags = {mscthesis}, month = apr, pages = {484}, publisher = {McGraw-Hill Education (India) Pvt Limited}, title = {{Neural networks in computer intelligence}}, url = {http://books.google.fi/books?id=gO1HZSRkk1EC}, year = {2003} }  @article{Fu2014, abstract = {The rapid development of social media sharing has created a huge demand for automatic media classification and annotation techniques. Attribute learning has emerged as a promising paradigm for bridging the semantic gap and addressing data sparsity via transferring attribute knowledge in object recognition and relatively simple action classification. In this paper, we address the task of attribute learning for understanding multimedia data with sparse and incomplete labels. 
In particular, we focus on videos of social group activities, which are particularly challenging and topical examples of this task because of their multimodal content and complex and unstructured nature relative to the density of annotations. To solve this problem, we 1) introduce a concept of semilatent attribute space, expressing user-defined and latent attributes in a unified framework, and 2) propose a novel scalable probabilistic topic model for learning multimodal semilatent attributes, which dramatically reduces requirements for an exhaustive accurate attribute ontology and expensive annotation effort. We show that our framework is able to exploit latent attributes to outperform contemporary approaches for addressing a variety of realistic multimedia sparse data learning tasks including: multitask learning, learning with label noise, N-shot transfer learning, and importantly zero-shot learning.}, author = {Fu, Yanwei and Hospedales, Timothy M and Xiang, Tao and Gong, Shaogang}, doi = {10.1109/TPAMI.2013.128}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Fu et al. - 2014 - Learning Multi-modal Latent Attributes.pdf:pdf}, issn = {1939-3539}, journal = {IEEE transactions on pattern analysis and machine intelligence}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = feb, number = {2}, pages = {303--16}, pmid = {24356351}, title = {{Learning Multi-modal Latent Attributes}}, url = {http://www.ncbi.nlm.nih.gov/pubmed/24840626}, volume = {36}, year = {2014} }  @book{Garson1998, abstract = {This book provides the first accessible introduction to neural network analysis as a methodological strategy for social scientists. The author details numerous studies and examples which illustrate the advantages of neural network analysis over other quantitative and modelling methods in widespread use. Methods are presented in an accessible style for readers who do not have a background in computer science. 
The book provides a history of neural network methods, a substantial review of the literature, detailed applications, coverage of the most common alternative models and examples of two leading software packages for neural network analysis.}, author = {Garson, G. David}, isbn = {9780857026279}, keywords = {Reference / Research,Social Science / Research,mscthesis}, language = {en}, mendeley-tags = {Reference / Research,Social Science / Research,mscthesis}, month = sep, pages = {202}, publisher = {SAGE}, shorttitle = {Neural Networks}, title = {{Neural Networks: An Introductory Guide for Social Scientists}}, url = {http://books.google.fi/books?id=LdNm\_ZmKr8YC}, year = {1998} }  @article{Goodfellow2013b, abstract = {The ICML 2013 Workshop on Challenges in Representation Learning focused on three challenges: the black box learning challenge, the facial expression recognition challenge, and the multimodal learning challenge. We describe the datasets created for these challenges and summarize the results of the competitions. We provide suggestions for organizers of future challenges and some comments on what kind of knowledge can be gained from machine learning competitions.}, archiveprefix = {arXiv}, arxivid = {arXiv:1307.0414v1}, author = {Goodfellow, IJ and Erhan, Dumitru and Carrier, PL and Courville, Aaron and Mirza, Mehdi and Hamner, Ben and Cukierski, Will and Tang, Yichuan and Thaler, David and Lee, Dong-Hyun and Zhou, Yingbo and Ramaiah, Chetan and Feng, Fangxiang and Li, Ruifan and Wang, Xiaojie and Athanasakis, Dimitris and Shawe-Taylor, John and Milakov, Maxim and Park, John and Ionescu, Radu and Popescu, Marius and Grozea, Cristian and Bergstra, James and Xie, Jingjing and Romaszko, Lukasz and Xu, Bing and Chuang, Zhang and Bengio, Yoshua}, eprint = {arXiv:1307.0414v1}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Goodfellow et al. 
- 2013 - Challenges in Representation Learning A report on three machine learning contests.pdf:pdf}, journal = {Neural Information \ldots}, keywords = {competition,dataset,mscthesis,representation learning}, mendeley-tags = {mscthesis}, pages = {1--8}, title = {{Challenges in Representation Learning: A report on three machine learning contests}}, url = {http://link.springer.com/chapter/10.1007/978-3-642-42051-1\_16}, year = {2013} }  @incollection{Gool1996, abstract = {The paper contributes to the viewpoint invariant recognition of planar patterns, especially labels and signs under affine deformations. By their nature, the information of such ‘eye-catchers’ is not contained in the outline or frame — they often are affinely equivalent like parallelograms and ellipses — but in the intensity content within. Moment invariants are well suited for their recognition. They need a closed bounding contour, but this is comparatively easy to provide for the simple shapes considered. On the other hand, they characterize the intensity patterns without the need for error prone feature extraction. This paper uses moments as the basic features, but extends the literature in two respects: (1) deliberate mixes of different types of moments to keep the order of the moments (and hence also the sensitivity to noise) low and yet have a sufficiently large number to safeguard discriminant power; and (2) invariance with respect to photometric changes is incorporated in order to find the simplest moment invariants that can cope with changing lighting conditions which can hardly be avoided when changing viewpoint. The paper gives complete classifications of such affine / photometric moment invariants. Experiments are described that illustrate the use of some of them.}, author = {Gool, Luc Van and Moons, Theo and Ungureanu, Dorin}, editor = {Buxton, Bernard and Cipolla, Roberto}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/DSDTBPTR/Gool et al. 
- 1996 - Affine photometric invariants for planar intensi.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/VHEDKIPF/BFb0015574.html:html}, isbn = {978-3-540-61122-6, 978-3-540-49949-7}, keywords = {Artificial Intelligence (incl. Robotics),Computer Graphics,Control Engineering,Image Processing and Computer Vision,Pattern Recognition,mscthesis}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Computer Graphics,Control Engineering,Image Processing and Computer Vision,Pattern Recognition,mscthesis}, month = jan, pages = {642--651}, publisher = {Springer Berlin Heidelberg}, series = {Lecture Notes in Computer Science}, title = {{Affine / photometric invariants for planar intensity patterns}}, url = {http://link.springer.com/chapter/10.1007/BFb0015574 http://link.springer.com/content/pdf/10.1007/BFb0015574.pdf}, year = {1996} }  @article{Hornik1989, abstract = {This paper rigorously establishes that standard multilayer feedforward networks with as few as one hidden layer using arbitrary squashing functions are capable of approximating any Borel measurable function from one finite dimensional space to another to any desired degree of accuracy, provided sufficiently many hidden units are available. 
In this sense, multilayer feedforward networks are a class of universal approximators.}, author = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert}, doi = {10.1016/0893-6080(89)90020-8}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WGDW5F55/0893608089900208.html:html}, issn = {0893-6080}, journal = {Neural networks}, keywords = {Back-propagation networks,Feedforward networks,Mapping networks,Network representation capability,Sigma-Pi networks,Squashing functions,Stone-Weierstrass Theorem,Universal approximation,mscthesis}, mendeley-tags = {Back-propagation networks,Feedforward networks,Mapping networks,Network representation capability,Sigma-Pi networks,Squashing functions,Stone-Weierstrass Theorem,Universal approximation,mscthesis}, number = {5}, pages = {359--366}, title = {{Multilayer feedforward networks are universal approximators}}, url = {http://www.sciencedirect.com/science/article/pii/0893608089900208}, volume = {2}, year = {1989} }  @article{Jaeger2004, abstract = {We present a method for learning nonlinear systems, echo state networks (ESNs). ESNs employ artificial recurrent neural networks in a way that has recently been proposed independently as a learning mechanism in biological brains. The learning method is computationally efficient and easy to use. On a benchmark task of predicting a chaotic time series, accuracy is improved by a factor of 2400 over previous techniques. 
The potential for engineering applications is illustrated by equalizing a communication channel, where the signal error rate is improved by two orders of magnitude.}, author = {Jaeger, Herbert and Haas, Harald}, doi = {10.1126/science.1091277}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/495Q7HFI/Jaeger and Haas - 2004 - Harnessing Nonlinearity Predicting Chaotic System.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Jaeger, Haas - 2004 - Harnessing Nonlinearity Predicting Chaotic Systems and Saving Energy in Wireless Communication.html:html;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Jaeger, Haas - 2004 - Harnessing Nonlinearity Predicting Chaotic Systems and Saving Energy in Wireless Communication.pdf:pdf}, issn = {0036-8075, 1095-9203}, journal = {Science}, keywords = {mscthesis}, language = {en}, mendeley-tags = {mscthesis}, month = apr, number = {5667}, pages = {78--80}, shorttitle = {Harnessing Nonlinearity}, title = {{Harnessing Nonlinearity: Predicting Chaotic Systems and Saving Energy in Wireless Communication}}, url = {http://www.sciencemag.org/content/304/5667/78 http://www.ncbi.nlm.nih.gov/pubmed/15064413 http://www.sciencemag.org/content/304/5667/78.full.pdf http://www.sciencemag.org/content/304/5667/78.short}, volume = {304}, year = {2004} }  @article{Jia2014, abstract = {Caffe provides multimedia scientists and practitioners with a clean and modifiable framework for state-of-the-art deep learning algorithms and a collection of reference models. The framework is a BSD-licensed C++ library with Python and MATLAB bindings for training and deploying general-purpose convolutional neural networks and other deep models efficiently on commodity architectures. Caffe fits industry and internet-scale media needs by CUDA GPU computation, processing over 40 million images a day on a single K40 or Titan GPU (\$\backslash approx\$2.5 ms per image). 
By separating model representation from actual implementation, Caffe allows experimentation and seamless switching among platforms for ease of development and deployment from prototyping machines to cloud environments. Caffe is maintained and developed by the Berkeley Vision and Learning Center (BVLC) with the help of an active community of contributors on GitHub. It powers ongoing research projects, large-scale industrial applications, and startup prototypes in vision, speech, and multimedia.}, annote = {Comment: Tech report for the Caffe software at http://github.com/BVLC/Caffe/}, author = {Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/VG755H2W/Jia et al. - 2014 - Caffe Convolutional Architecture for Fast Feature.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/BG398UZW/1408.html:html}, journal = {Proceedings of the 22nd ACM International Conference on Multimedia}, keywords = {Computer Science - Computer Vision and Pattern Rec,Computer Science - Learning,Computer Science - Neural and Evolutionary Computi,mscthesis}, mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,Computer Science - Learning,Computer Science - Neural and Evolutionary Computi,mscthesis}, month = jun, shorttitle = {Caffe}, title = {{Caffe: Convolutional architecture for fast feature embedding}}, url = {http://arxiv.org/abs/1408.5093 http://www.arxiv.org/pdf/1408.5093.pdf http://dl.acm.org/citation.cfm?id=2654889}, year = {2014} }  @article{Kim2007, abstract = {In this paper, a human action recognition method using a hybrid neural network is presented. The method consists of three stages: preprocessing, feature extraction, and pattern classification. For feature extraction, we propose a modified convolutional neural network (CNN) which has a three-dimensional receptive field. The CNN generates a set of feature maps from the action descriptors which are derived from a spatiotemporal volume. 
A weighted fuzzy min-max (WFMM) neural network is used for the pattern classification stage. We introduce a feature selection technique using the WFMM model to reduce the dimensionality of the feature space. Two kinds of relevance factors between features and pattern classes are defined to analyze the salient features.}, author = {Kim, HJ and Lee, JS and Yang, HS}, editor = {Liu, Derong and Fei, Shumin and Hou, Zengguang and Zhang, Huaguang and Sun, Changyin}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/28Q637I7/Kim et al. - 2007 - Human Action Recognition Using a Modified Convolut.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/UPJ6QDN3/978-3-540-72393-6\_85.html:html}, isbn = {978-3-540-72392-9, 978-3-540-72393-6}, journal = {Advances in Neural Networks-ISNN 2007}, keywords = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Computation by Abstract Devices,Computer Communication Networks,Discrete Mathematics in Computer Science,Pattern Recognition,mscthesis}, language = {en}, mendeley-tags = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Computation by Abstract Devices,Computer Communication Networks,Discrete Mathematics in Computer Science,Pattern Recognition,mscthesis}, month = jan, pages = {715--723}, publisher = {Springer Berlin Heidelberg}, series = {Lecture Notes in Computer Science}, title = {{Human action recognition using a modified convolutional neural network}}, url = {http://link.springer.com/chapter/10.1007/978-3-540-72393-6\_85 http://link.springer.com/content/pdf/10.1007\%2F978-3-540-72393-6\_85.pdf}, year = {2007} }  @article{Laptev2008, abstract = {The aim of this paper is to address recognition of natural human actions in diverse and realistic video settings. 
This challenging but important subject has mostly been ignored in the past due to several problems one of which is the lack of realistic and annotated video datasets. Our first contribution is to address this limitation and to investigate the use of movie scripts for automatic annotation of human actions in videos. We evaluate alternative methods for action retrieval from scripts and show benefits of a text-based classifier. Using the retrieved action samples for visual learning, we next turn to the problem of action classification in video. We present a new method for video classification that builds upon and extends several recent ideas including local space-time features, space-time pyramids and multi-channel non-linear SVMs. The method is shown to improve state-of-the-art results on the standard KTH action dataset by achieving 91.8\% accuracy. Given the inherent problem of noisy labels in automatic annotation, we particularly investigate and show high tolerance of our method to annotation errors in the training set. We finally apply the method to learning and classifying challenging action classes in movies and show promising results.}, author = {Laptev, Ivan and Marszalek, Marcin}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Laptev, Marszalek - 2008 - Learning realistic human actions from movies.pdf:pdf}, journal = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Learning realistic human actions from movies}}, url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4587756}, year = {2008} }  @article{Larochelle2007, abstract = {Recently, several learning algorithms relying on models with deep architectures have been proposed. Though they have demonstrated impressive performance, to date, they have only been evaluated on relatively simple problems such as digit recognition in a controlled environment, for which many machine learning algorithms already report reasonable results. 
Here, we present a series of experiments which indicate that these models show promise in solving harder learning problems that exhibit many factors of variation. These models are compared with well-established algorithms such as Support Vector Machines and single hidden-layer feed-forward neural networks.}, author = {Larochelle, Hugo and Erhan, Dumitru and Courville, Aaron and Bergstra, James and Bengio, Yoshua}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Larochelle et al. - 2007 - An empirical evaluation of deep architectures on problems with many factors of variation.pdf:pdf}, journal = {Proceedings of the 24th International Conference on Machine Learning}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, number = {2006}, pages = {8}, title = {{An empirical evaluation of deep architectures on problems with many factors of variation}}, url = {http://dl.acm.org/citation.cfm?id=1273556}, year = {2007} }  @article{Le2010, abstract = {Convolutional neural networks (CNNs) have been successfully applied to many tasks such as digit and object recognition. Using convolutional (tied) weights significantly reduces the number of parameters that have to be learned, and also allows translational invariance to be hard-coded into the architecture. In this paper, we consider the problem of learning invariances, rather than relying on hardcoding. We propose tiled convolution neural networks (Tiled CNNs), which use a regular “tiled ” pattern of tied weights that does not require that adjacent hidden units share identical weights, but instead requires only that hidden units k steps away from each other to have tied weights. By pooling over neighboring units, this architecture is able to learn complex invariances (such as scale and rotational invariance) beyond translational invariance. Further, it also enjoys much of CNNs’ advantage of having a relatively small number of learned parameters (such as ease of learning and greater scalability). 
We provide an efficient learning algorithm for Tiled CNNs based on Topographic ICA, and show that learning complex invariant features allows us to achieve highly competitive results for both the NORB and CIFAR-10 datasets.}, author = {Le, QV and Ngiam, Jiquan and Chen, Zhenghao and hao Chia, DJ and Koh, PW}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Le et al. - 2010 - Tiled convolutional neural networks.pdf:pdf}, journal = {NIPS}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--9}, title = {{Tiled convolutional neural networks.}}, url = {https://papers.nips.cc/paper/4136-tiled-convolutional-neural-networks.pdf}, year = {2010} }  @article{min2014, abstract = {We propose a novel deep network structure called "Network In Network" (NIN) to enhance model discriminability for local patches within the receptive field. The conventional convolutional layer uses linear filters followed by a nonlinear activation function to scan the input. Instead, we build micro neural networks with more complex structures to abstract the data within the receptive field. We instantiate the micro neural network with a multilayer perceptron, which is a potent function approximator. The feature maps are obtained by sliding the micro networks over the input in a similar manner as CNN; they are then fed into the next layer. Deep NIN can be implemented by stacking mutiple of the above described structure. With enhanced local modeling via the micro network, we are able to utilize global average pooling over feature maps in the classification layer, which is easier to interpret and less prone to overfitting than traditional fully connected layers. 
We demonstrated the state-of-the-art classification performances with NIN on CIFAR-10 and CIFAR-100, and reasonable performances on SVHN and MNIST datasets.}, archiveprefix = {arXiv}, arxivid = {1312.4400}, author = {Lin, Min and Chen, Qiang and Yan, Shuicheng}, eprint = {1312.4400}, file = {:share/imagedb/perellm1/references/Lin, Chen, Yan\_2013\_Network In Network.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = dec, pages = {10}, title = {{Network In Network}}, url = {http://arxiv.org/abs/1312.4400}, year = {2013} }  @article{Matas2004, abstract = {The wide-baseline stereo problem, i.e. the problem of establishing correspondences between a pair of images taken from different viewpoints is studied. A new set of image elements that are put into correspondence, the so called extremal regions, is introduced. Extremal regions possess highly desirable properties: the set is closed under (1) continuous (and thus projective) transformation of image coordinates and (2) monotonic transformation of image intensities. An efficient (near linear complexity) and practically fast detection algorithm (near frame rate) is presented for an affinely invariant stable subset of extremal regions, the maximally stable extremal regions (MSER). A new robust similarity measure for establishing tentative correspondences is proposed. The robustness ensures that invariants from multiple measurement regions (regions obtained by invariant constructions from extremal regions), some that are significantly larger (and hence discriminative) than the MSERs, may be used to establish tentative correspondences. The high utility of MSERs, multiple measurement regions and the robust metric is demonstrated in wide-baseline experiments on image pairs from both indoor and outdoor scenes. 
Significant change of scale (3.5×), illumination conditions, out-of-plane rotation, occlusion, locally anisotropic scale change and 3D translation of the viewpoint are all present in the test problems. Good estimates of epipolar geometry (average distance from corresponding points to the epipolar line below 0.09 of the inter-pixel distance) are obtained.}, author = {Matas, J and Chum, O and Urban, M and Pajdla, T}, doi = {10.1016/j.imavis.2004.02.006}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WX4M8ZRN/Matas et al. - 2004 - Robust wide-baseline stereo from maximally stable .pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/FHIK7RQM/S0262885604000435.html:html}, issn = {0262-8856}, journal = {Image and Vision Computing}, keywords = {Distinguished regions,MSER,Maximally stable extremal regions,Robust metric,Wide-baseline stereo,mscthesis}, mendeley-tags = {Distinguished regions,MSER,Maximally stable extremal regions,Robust metric,Wide-baseline stereo,mscthesis}, month = sep, number = {10}, pages = {761--767}, series = {British Machine Vision Computing 2002}, title = {{Robust wide-baseline stereo from maximally stable extremal regions}}, url = {http://www.sciencedirect.com/science/article/pii/S0262885604000435 http://www.sciencedirect.com/science/article/pii/S0262885604000435/pdf?md5=c7d50066c02cceeebfc4e721ea8d341c\&pid=1-s2.0-S0262885604000435-main.pdf}, volume = {22}, year = {2004} }  @article{Metropolis1953, abstract = {A general method, suitable for fast computing machines, for investigating such properties as equations of state for substances consisting of interacting individual molecules is described. The method consists of a modified Monte Carlo integration over configuration space. Results for the two‐dimensional rigid‐sphere system have been obtained on the Los Alamos MANIAC and are presented here. 
These results are compared to the free volume equation of state and to a four‐term virial coefficient expansion.}, author = {Metropolis, Nicholas and Rosenbluth, Arianna W. and Rosenbluth, Marshall N. and Teller, Augusta H. and Teller, Edward}, doi = {10.1063/1.1699114}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Metropolis et al. - 1953 - Equation of State Calculations by Fast Computing Machines.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Metropolis et al. - 1953 - Equation of State Calculations by Fast Computing Machines(2).pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Metropolis et al. - 1953 - Equation of State Calculations by Fast Computing Machines.html:html}, issn = {0021-9606, 1089-7690}, journal = {The Journal of Chemical Physics}, keywords = {Atomic and molecular interactions,Equations of state,Monte Carlo methods,mscthesis}, mendeley-tags = {Atomic and molecular interactions,Equations of state,Monte Carlo methods,mscthesis}, month = jun, number = {6}, pages = {1087--1092}, title = {{Equation of State Calculations by Fast Computing Machines}}, url = {http://scitation.aip.org/content/aip/journal/jcp/21/6/10.1063/1.1699114 http://scitation.aip.org/deliver/fulltext/aip/journal/jcp/21/6/1.1699114.pdf;jsessionid=3q4iihe35gtci.x-aip-live-03?itemId=/content/aip/journal/jcp/21/6/10.1063/1.1699114\&mimeType=pdf\&containerItemId=content/aip/journal/jcp}, volume = {21}, year = {1953} }  @techreport{Neumann1945, author = {von Neumann, John}, booktitle = {IEEE Annals of the History of Computing}, file = {:share/imagedb/perellm1/references/Neumann\_1945\_First Draft of a Report on the EDVAC.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, number = {1}, title = {{First Draft of a Report on the EDVAC}}, url = {http://www.computer.org/csdl/mags/an/1993/04/man1993040027.pdf}, volume = {15}, year = {1945} }  @article{Novikoff1962, author = 
{Novikoff, Albert B. J.}, journal = {Proceedings of the Symposium on the Mathematical Theory of Automata, New York}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {615--622}, title = {{On convergence proofs on perceptrons}}, volume = {XII}, year = {1962} }  @article{Ripley1994a, abstract = {Feed-forward neural networks are now widely used in classification problems, whereas nonlinear methods of discrimination developed in the statistical field are much less widely known. A general framework for classification is set up within which methods from statistics, neural networks, pattern recognition and machine learning can be compared. Neural networks emerge as one of a class of flexible non-linear regression methods which can be used to classify via regression. Many interesting issues remain, including parameter estimation, the assessment of the classifiers and in algorithm development.}, author = {Ripley, B. D.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/IMDP979E/Ripley - 1994 - Neural Networks and Related Methods for Classifica.pdf:pdf}, issn = {0035-9246}, journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = jan, number = {3}, pages = {409--456}, title = {{Neural Networks and Related Methods for Classification}}, url = {http://www.jstor.org/stable/2346118 http://www.jstor.org/stable/pdfplus/2346118.pdf?acceptTC=true}, volume = {56}, year = {1994} }  @book{Szeliski2010, abstract = {Humans perceive the three-dimensional structure of the world with apparent ease. 
However, despite all of the recent advances in computer vision research, the dream of having a computer interpret an image at the same level as a ...}, author = {Szeliski, R}, file = {:share/imagedb/perellm1/references/Szeliski\_2010\_Computer vision algorithms and applications.pdf:pdf}, keywords = {Computer Vision - Algorithms and Applications,Image Processing and Computer Vision,mscthesis}, mendeley-tags = {Computer Vision - Algorithms and Applications,Image Processing and Computer Vision,mscthesis}, title = {{Computer vision: algorithms and applications}}, url = {http://www.springer.com/computer/image+processing/book/978-1-84882-934-3 http://books.google.com/books?hl=en\&lr=\&id=bXzAlkODwa8C\&oi=fnd\&pg=PR9\&dq=Computer+Vision+-+Algorithms+and+Applications\&ots=gZXb64pzIJ\&sig=PUgX4ne\_FzulafmKBgw\_2UjcC00}, year = {2010} }  @article{Tompson2014, abstract = {Recent state-of-the-art performance on human-body pose estimation has been achieved with Deep Convolutional Networks (ConvNets). Traditional ConvNet architectures include pooling layers which reduce computational requirements, introduce invariance and prevent over-training. These benefits of pooling come at the cost of reduced localization accuracy. We introduce a novel architecture which includes an efficient 'position refinement' model that is trained to estimate the joint offset location within a small region of the image. This refinement model is jointly trained in cascade with a state-of-the-art ConvNet model to achieve improved accuracy in human joint location estimation. We show that the variance of our detector approaches the variance of human annotations on the FLIC dataset and outperforms all existing approaches on the MPII-human-pose dataset.}, annote = {Comment: 8 pages with 1 page of citations}, author = {Tompson, Jonathan and Goroshin, Ross and Jain, Arjun}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/6J6RQBA7/Tompson et al. 
- 2014 - Efficient Object Localization Using Convolutional .pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Q6HIWGNZ/1411.html:html}, journal = {arXiv preprint arXiv:1411.4280}, keywords = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, month = nov, title = {{Efficient Object Localization Using Convolutional Networks}}, url = {http://arxiv.org/abs/1411.4280 http://www.arxiv.org/pdf/1411.4280.pdf}, year = {2014} }  @article{Topics2013, author = {Topics, Advanced and Computational, I N}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Topics, Computational - 2013 - Comparison of Artificial Neural Networks and training an Extreme Learning Machine.pdf:pdf}, keywords = {artificial neural network,extreme learning machine,mscthesis}, mendeley-tags = {mscthesis}, number = {April}, pages = {1--3}, title = {{Comparison of Artificial Neural Networks and training an Extreme Learning Machine}}, year = {2013} }  @article{Uttley1956a, author = {Uttley, Albert M.}, journal = {Automata studies}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {253--276}, title = {{Conditional probability machines and conditional reflexes}}, year = {1956} }  @article{Wilkes1997, abstract = {In his book Mind and body (1873), Bain set out an account in which he related the processes of associative memory to the distribution of activity in neural groupings--or neural networks as they are now termed. In the course of this account, Bain anticipated certain aspects of connectionist ideas that are normally attributed to 20th-century authors--most notably Hebb (1949). In this paper we reproduce Bain's arguments relating neural activity to the workings of associative memory which include an early version of the principles enshrined in Hebb's neurophysiological postulate. 
Nonetheless, despite their prescience, these specific contributions to the connectionist case have been almost entirely ignored. Eventually, Bain came to doubt the practicality of his own arguments and, in so doing, he seems to have ensured that his ideas concerning neural groupings exerted little or no influence on the subsequent course of theorizing in this area.}, author = {Wilkes, A. L. and Wade, N. J.}, doi = {10.1006/brcg.1997.0869}, file = {:share/imagedb/perellm1/references/Wilkes, Wade\_1997\_Bain on neural networks.pdf:pdf}, issn = {0278-2626}, journal = {Brain and Cognition}, keywords = {Brain,History- 19th Century,History- 20th Century,Humans,Memory,Models- Neurological,Nerve Net,Neuropsychology,mscthesis}, language = {eng}, mendeley-tags = {Brain,History- 19th Century,History- 20th Century,Humans,Memory,Models- Neurological,Nerve Net,Neuropsychology,mscthesis}, month = apr, number = {3}, pages = {295--305}, title = {{Bain on neural networks}}, url = {http://www.ncbi.nlm.nih.gov/pubmed/9126397}, volume = {33}, year = {1997} }  @incollection{Mansinghka2013, author = {Mansinghka, Vikash and Kulkarni, Tejas D and Perov, Yura N and Tenenbaum, Josh}, editor = {Burges, C. J. C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K. Q.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/JU8SVF65/Mansinghka et al. 
- 2013 - Approximate Bayesian Image Interpretation using Ge.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/ZDENPGZM/4881-approximate-bayesian-image-interpretation-using-generative-probabilistic.html:html}, pages = {1520--1528}, publisher = {Curran Associates, Inc.}, title = {{Approximate Bayesian Image Interpretation using Generative Probabilistic Graphics Programs}}, url = {http://papers.nips.cc/paper/4881-approximate-bayesian-image-interpretation-using-generative-probabilistic-graphics-programs.pdf http://papers.nips.cc/paper/4881-approximate-bayesian-image-interpretation-using-generative-probabilistic http://papers.nips.cc/paper/4881-approximate-bayesian-image-interpretation-using-generative-probabilistic.pdf}, year = {2013} }  @article{Amari1967, abstract = {This paper describes error-correction adjustment procedures for determining the weight vector of linear pattern classifiers under general pattern distribution. It is mainly aimed at clarifying theoretically the performance of adaptive pattern classifiers. In the case where the loss depends on the distance between a pattern vector and a decision boundary and where the average risk function is unimodal, it is proved that, by the procedures proposed here, the weight vector converges to the optimal one even under nonseparable pattern distributions. The speed and the accuracy of convergence are analyzed, and it is shown that there is an important tradeoff between speed and accuracy of convergence. Dynamical behaviors, when the probability distributions of patterns are changing, are also shown. 
The theory is generalized and made applicable to the case with general discriminant functions, including piecewise-linear discriminant functions.}, author = {Amari, Shunichi}, doi = {10.1109/PGEC.1967.264666}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/QVFUWMM8/Amari - 1967 - A Theory of Adaptive Pattern Classifiers.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/GQRGAH3X/abs\_all.html:html}, issn = {0367-7508}, journal = {IEEE Transactions on Electronic Computers}, keywords = {Accuracy of learning,Computer errors,Convergence,Logic,Piecewise linear techniques,Probability distribution,Vectors,adaptive pattern classifier,adaptive systems,convergence of learning,learning under nonseparable pattern distribution,linear decision function,mscthesis,piecewise-linear decision function,rapidity of learning}, mendeley-tags = {Accuracy of learning,Computer errors,Convergence,Logic,Piecewise linear techniques,Probability distribution,Vectors,adaptive pattern classifier,adaptive systems,convergence of learning,learning under nonseparable pattern distribution,linear decision function,mscthesis,piecewise-linear decision function,rapidity of learning}, month = jun, number = {3}, pages = {299--307}, title = {{A Theory of Adaptive Pattern Classifiers}}, url = {http://ieeexplore.ieee.org/ielx5/4037753/4039060/04039068.pdf?tp=\&arnumber=4039068\&isnumber=4039060 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4039068\&tag=1}, volume = {EC-16}, year = {1967} }  @book{Anderson1988, address = {Cambridge, MA, USA}, author = {Anderson, James A. and Rosenfeld, Edward}, editor = {Anderson, James A. 
and Rosenfeld, Edward}, isbn = {0-262-01097-6}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, publisher = {MIT Press}, shorttitle = {Neurocomputing}, title = {{Neurocomputing: foundations of research}}, url = {http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:Neurocomputing:+Foundations+of+Research\#3}, year = {1988} }  @article{Arandjelovic2012, abstract = {The AXES project participated in the interactive instance search task (INS), the known-item search task (KIS), and the multimedia event detection task (MED) for TRECVid 2012. As in our TRECVid 2011 system, we used nearly identical search systems and user interfaces for both INS and KIS. Our interactive INS and KIS systems focused this year on using classifiers trained at query time with positive examples collected from external search engines. Participants in our KIS experiments were media professionals from the BBC; our INS experiments were carried out by students and researchers at Dublin City University. We performed comparatively well in both experiments. Our best KIS run found 13 of the 25 topics, and our best INS runs outperformed all other submitted runs in terms of P@100. For MED, the system presented was based on a minimal number of low-level descriptors, which we chose to be as large as computationally feasible. These descriptors are aggregated to produce high-dimensional video-level signatures, which are used to train a set of linear classifiers. Our MED system achieved the second-best score of all submitted runs in the main track, and best score in the ad-hoc track, suggesting that a simple system based on state-of-the-art low-level descriptors can give relatively high performance. 
This paper describes in detail our KIS, INS, and MED systems and the results and findings of our experiments.}, author = {Arandjelovic, R and Zisserman, Andrew and Fernando, Basura}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Arandjelovic, Zisserman, Fernando - 2012 - AXES at TRECVid 2012 KIS, INS, and MED.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{AXES at TRECVid 2012: KIS, INS, and MED}}, url = {http://hal.inria.fr/hal-00746874/PDF/oneata12tv.pdf}, year = {2012} }  @article{Araujo, abstract = {Video search has become a very important tool, with the ever-growing size of multimedia collections. This work introduces our Video Semantic Indexing system. Our experiments show that Residual Vectors provide an efficient way of aggregat- ing local descriptors, with complementary gain with respect to BoVW. Also, we show that systems using a limited number of descriptors and machine learning techniques can still be quite effective. Our first participation at the TRECVID evaluation has been very fruitful: our team was ranked 6th in the light version of the Semantic Indexing task.}, author = {Araujo, A F De and Silveira, F and Lakshman, H and Zepeda, J and Sheth, A and Girod, B}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Araujo et al. 
- 2012 - The Stanford Technicolor Fraunhofer HHI Video.pdf:pdf}, keywords = {bovw,centrist,dense ex-,for each run,harlap keypoint detector,l a stanford1 1,mscthesis,oppsift,residual,sift,spm,traction,trecvid}, mendeley-tags = {mscthesis,trecvid}, title = {{The Stanford / Technicolor / Fraunhofer HHI Video}}, year = {2012} }  @book{Bishop2006, author = {Bishop, Christopher M.}, file = {:share/imagedb/perellm1/references/Bishop\_2006\_Pattern recognition and machine learning(2).pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, publisher = {New York: springer, 2006.}, title = {{Pattern recognition and machine learning.}}, volume = {1}, year = {2006} }  @inproceedings{Chai2000, abstract = {This paper addresses an image classification technique that uses the Bayes decision rule for minimum cost to classify pixels into skin color and non-skin color. Color statistics are collected from YCbCr color space. The Bayesian approach to skin color classification is discussed along with an overview of YCbCr color space. Experimental results demonstrate that this approach can achieve good classification outcomes, and it is robust against different skin colors}, author = {Chai, D. 
and Bouzerdoum, A.}, doi = {10.1109/TENCON.2000.888774}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Z6JUI44P/Chai and Bouzerdoum - 2000 - A Bayesian approach to skin color classification i.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/R6EWJQV4/abs\_all.html:html}, keywords = {Australia,Bayes decision rule,Bayes methods,Bayesian approach,Bayesian methods,Chromium,Costs,Mathematics,Pixel,Skin,Statistics,TV,YCbCr color space,color statistics,image classification,image colour analysis,minimum cost,mscthesis,nonskin color,pixels classification,skin color classification}, mendeley-tags = {Australia,Bayes decision rule,Bayes methods,Bayesian approach,Bayesian methods,Chromium,Costs,Mathematics,Pixel,Skin,Statistics,TV,YCbCr color space,color statistics,image classification,image colour analysis,minimum cost,mscthesis,nonskin color,pixels classification,skin color classification}, pages = {421--424 vol.2}, title = {{A Bayesian approach to skin color classification in YCbCr color space}}, url = {http://ieeexplore.ieee.org/ielx5/7129/19210/00888774.pdf?tp=\&arnumber=888774\&isnumber=19210 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=888774\&tag=1}, volume = {2}, year = {2000} }  @article{Chatfield2014, abstract = {The latest generation of Convolutional Neural Networks (CNN) have achieved impressive results in challenging benchmarks on image recognition and object detection, significantly raising the interest of the community in these methods. Nevertheless, it is still unclear how different CNN methods compare with each other and with previous state-of-the-art shallow representations such as the Bag-of-Visual-Words and the Improved Fisher Vector. This paper conducts a rigorous evaluation of these new techniques, exploring different deep architectures and comparing them on a common ground, identifying and disclosing important implementation details. 
We identify several useful properties of CNN-based representations, including the fact that the dimensionality of the CNN output layer can be reduced significantly without having an adverse effect on performance. We also identify aspects of deep and shallow methods that can be successfully shared. A particularly significant one is data augmentation, which achieves a boost in performance in shallow methods analogous to that observed with CNN-based methods. Finally, we are planning to provide the configurations and code that achieve the state-of-the-art performance on the PASCAL VOC Classification challenge, along with alternative configurations trading-off performance, computation speed and compactness.}, archiveprefix = {arXiv}, arxivid = {arXiv:1405.3531v2}, author = {Chatfield, Ken and Simonyan, Karen}, eprint = {arXiv:1405.3531v2}, file = {:share/imagedb/perellm1/references/Chatfield, Simonyan\_2014\_Return of the Devil in the Details Delving Deep into Convolutional Nets.pdf:pdf}, journal = {arXiv preprint arXiv: \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--11}, title = {{Return of the Devil in the Details: Delving Deep into Convolutional Nets}}, url = {http://arxiv.org/abs/1405.3531}, year = {2014} }  @article{Crevier1993, address = {New York, NY, USA}, author = {Crevier, Daniel}, isbn = {0-465-02997-3}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, publisher = {Basic Books, Inc.}, shorttitle = {AI}, title = {{AI: The tumultuous history of the search for artificial intelligence}}, url = {http://dl.acm.org/citation.cfm?id=151188}, year = {1993} }  @book{Deng2014, abstract = {This book is aimed to provide an overview of general deep learning methodology and its applications to a variety of signal and information processing tasks. 
The application areas are chosen with the following three criteria: 1) expertise or knowledge of the authors; 2) the application areas that have already been transformed by the successful use of deep learning technology, such as speech recognition and computer vision; and 3) the application areas that have the potential to be impacted significantly by deep learning and that have gained concentrated research efforts, including natural language and text processing, information retrieval, and multimodal information processing empowered by multi-task deep learning. In Chapter 1, we provide the background of deep learning, as intrinsically connected to the use of multiple layers of nonlinear transformations to derive features from the sensory signals such as speech and visual images. In the most recent literature, deep learning is embodied also as representation learning, which involves a hierarchy of features or concepts where higher-level representations of them are defined from lower-level ones and where the same lower-level representations help to define higher-level ones. In Chapter 2, a brief historical account of deep learning is presented. In particular, selected chronological development of speech recognition is used to illustrate the recent impact of deep learning that has become a dominant technology in speech recognition industry within only a few years since the start of a collaboration between academic and industrial researchers in applying deep learning to speech recognition. In Chapter 3, a three-way classification scheme for a large body of work in deep learning is developed. We classify a growing number of deep learning techniques into unsupervised, supervised, and hybrid categories, and present qualitative descriptions and a literature survey for each category. From Chapter 4 to Chapter 6, we discuss in detail three popular deep networks and related learning methods, one in each category. 
Chapter 4 is devoted to deep autoencoders as a prominent example of the unsupervised deep learning techniques. Chapter 5 gives a major example in the hybrid deep network category, which is the discriminative feed-forward neural network for supervised learning with many layers initialized using layer-by-layer generative, unsupervised pre-training. In Chapter 6, deep stacking networks and several of the variants are discussed in detail, which exemplify the discriminative or supervised deep learning techniques in the three-way categorization scheme. In Chapters 7-11, we select a set of typical and successful applications of deep learning in diverse areas of signal and information processing and of applied artificial intelligence. In Chapter 7, we review the applications of deep learning to speech and audio processing, with emphasis on speech recognition organized according to several prominent themes. In Chapters 8, we present recent results of applying deep learning to language modeling and natural language processing. Chapter 9 is devoted to selected applications of deep learning to information retrieval including Web search. In Chapter 10, we cover selected applications of deep learning to image object recognition in computer vision. Selected applications of deep learning to multi-modal processing and multi-task learning are reviewed in Chapter 11. 
Finally, an epilogue is given in Chapter 12 to summarize what we presented in earlier chapters and to discuss future challenges and directions.}, author = {Deng, Li and Yu, Dong}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Deng, Yu - 2014 - Deep Learning Methods and Applications.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Deng, Yu - 2014 - Deep Learning Methods and Applications(2).pdf:pdf}, isbn = {9781405161251}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--134}, title = {{Deep Learning: Methods and Applications}}, url = {http://research.microsoft.com/pubs/209355/NOW-Book-Revised-Feb2014-online.pdf}, year = {2014} }  @book{Ham2000, abstract = {From the Publisher:This exciting new text covers artificial neural networks,but more specifically,neurocomputing. Neurocomputing is concerned with processing information,which involves a learning process within an artificial neural network architecture. This neural architecture responds to inputs according to a defined learning rule and then the trained network can be used to perform certain tasks depending on the application. Neurocomputing can play an important role in solving certain problems such as pattern recognition,optimization,event classification,control and identification of nonlinear systems,and statistical analysis. "Principles of Neurocomputing for Science and Engineering," unlike other neural networks texts,is written specifically for scientists and engineers who want to apply neural networks to solve complex problems. For each neurocomputing concept,a solid mathematical foundation is presented along with illustrative examples to accompany that particular architecture and associated training algorithm. The book is primarily intended for graduate-level neural networks courses,but in some instances may be used at the undergraduate level. 
The book includes many detailed examples and an extensive set of end-of-chapter problems.}, author = {Ham, Fredric M. and Kostanic, Ivica}, edition = {1st}, file = {:share/imagedb/perellm1/references/Ham, Kostanic\_2000\_Principles of Neurocomputing for Science and Engineering.pdf:pdf}, isbn = {0070259666}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, publisher = {McGraw-Hill Higher Education}, title = {{Principles of Neurocomputing for Science and Engineering}}, year = {2000} }  @article{Hoffman2014, abstract = {A major challenge in scaling object detection is the difficulty of obtaining labeled images for large numbers of categories. Recently, deep convolutional neural networks (CNNs) have emerged as clear winners on object classification benchmarks, in part due to training with 1.2M+ labeled classification images. Unfortunately, only a small fraction of those labels are available for the detection task. It is much cheaper and easier to collect large quantities of image-level labels from search engines than it is to collect detection data and label it with precise bounding boxes. In this paper, we propose Large Scale Detection through Adaptation (LSDA), an algorithm which learns the difference between the two tasks and transfers this knowledge to classifiers for categories without bounding box annotated data, turning them into detectors. Our method has the potential to enable detection for the tens of thousands of categories that lack bounding box annotations, yet have plenty of classification data. Evaluation on the ImageNet LSVRC-2013 detection challenge demonstrates the efficacy of our approach. This algorithm enables us to produce a >7.6K detector by using available classification data from leaf nodes in the ImageNet tree. We additionally demonstrate how to modify our architecture to produce a fast detector (running at 2fps for the 7.6K detector). 
Models and software are available at}, annote = {$\backslash$begin\{itemize\}$\backslash$item Converting a classifier into a detector$\backslash$item ImageNet only contains 200 annotated classes for detection$\backslash$item Other approaches Multiple Instance Learning$\backslash$item Take Alexnet change last layer to desired number of classes and finetune$\backslash$item Finetune for detection using also background class$\backslash$item Compute category score as score\_category - score\_background$\backslash$item Experiment$\backslash$begin\{itemize\}$\backslash$item ILSVRC2013 detection dataset$\backslash$item 1.000 images per class$\backslash$item 200 categories$\backslash$item val1 : 100 categories with bounding box for detection training$\backslash$item val2 : 100 categories for evaluation$\backslash$end\{itemize\}$\backslash$item Results$\backslash$begin\{itemize\}$\backslash$item full LSDA 50$\backslash$\% relative mAP boost over only classifier$\backslash$item Classifier only focus on most discriminative parts (Ex: face of an animal)$\backslash$item After detection finetuning detects all the body$\backslash$item False positive errors$\backslash$begin\{itemize\}$\backslash$item localization errors (Loc):$\backslash$item confusion with background (BG):$\backslash$item other (Oth): Most of errors because confusion of the class$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item They released the 7.6K model for detection in lsda.berkeleyvision.org$\backslash$item minimize the gap between classifiers and detectors$\backslash$end\{itemize\}}, author = {Hoffman, Judy and Guadarrama, Sergio and Tzeng, ES}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/27UXR89J/Hoffman et al. 
- 2014 - LSDA Large Scale Detection Through Adaptation.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/DSAMNJJM/1407.html:html}, journal = {Advances in Neural \ldots}, keywords = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, month = jul, shorttitle = {LSDA}, title = {{LSDA: Large Scale Detection through Adaptation}}, url = {http://arxiv.org/abs/1407.5035 http://www.arxiv.org/pdf/1407.5035.pdf http://papers.nips.cc/paper/5498-lsda-large-scale-detection-through-adaptation}, year = {2014} }  @article{HowardC.Warren1921, author = {{Howard C. Warren}}, file = {:share/imagedb/perellm1/references//Howard C. Warren\_1921\_A history of the association psychology.pdf:pdf}, keywords = {mscthesis}, language = {eng}, mendeley-tags = {mscthesis}, pages = {355}, publisher = {Charles Scribner's Sons}, title = {{A history of the association psychology}}, url = {http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:A+History+Of+The+Association+Psychology\#0 http://archive.org/details/historyoftheasso007979mbp}, year = {1921} }  @article{Hyvarinen2000a, abstract = {A fundamental problem in neural network research, as well as in many other disciplines, is finding a suitable representation of multivariate data, i.e. random vectors. For reasons of computational and conceptual simplicity, the representation is often sought as a linear transformation of the original data. In other words, each component of the representation is a linear combination of the original variables. Well-known linear transformation methods include principal component analysis, factor analysis, and projection pursuit. Independent component analysis (ICA) is a recently developed method in which the goal is to find a linear representation of non-Gaussian data so that the components are statistically independent, or as independent as possible. 
Such a representation seems to capture the essential structure of the data in many applications, including feature extraction and signal separation. In this paper, we present the basic theory and applications of ICA, and our recent work on the subject.}, author = {Hyv\"{a}rinen, a and Oja, E}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hyv\"{a}rinen, Oja - 2000 - Independent component analysis algorithms and applications.pdf:pdf}, issn = {0893-6080}, journal = {Neural networks : the official journal of the International Neural Network Society}, keywords = {Algorithms,Artifacts,Brain,Brain: physiology,Humans,Magnetoencephalography,Neural Networks (Computer),Normal Distribution,mscthesis}, mendeley-tags = {mscthesis}, number = {4-5}, pages = {411--30}, pmid = {10946390}, title = {{Independent component analysis: algorithms and applications.}}, url = {http://www.ncbi.nlm.nih.gov/pubmed/10946390}, volume = {13}, year = {2000} }  @article{Kadir2001, abstract = {Many computer vision problems can be considered to consist of two main tasks: the extraction of image content descriptions and their subsequent matching. The appropriate choice of type and level of description is of course task dependent, yet it is generally accepted that the low-level or so called early vision layers in the Human Visual System are context independent. This paper concentrates on the use of low-level approaches for solving computer vision problems and discusses three inter-related aspects of this: saliency; scale selection and content description. In contrast to many previous approaches which separate these tasks, we argue that these three aspects are intrinsically related. 
Based on this observation, a multiscale algorithm for the selection of salient regions of an image is introduced and its application to matching type problems such as tracking, object recognition and image retrieval is demonstrated.}, author = {Kadir, Timor and Brady, Michael}, doi = {10.1023/A:1012460413855}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/4PSMIEB8/Kadir and Brady - 2001 - Saliency, Scale and Image Description.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/MVH8KZ6J/A1012460413855.html:html}, issn = {0920-5691, 1573-1405}, journal = {International Journal of Computer Vision}, keywords = {Artificial Intelligence (incl. Robotics),Automation and Robotics,Computer Imaging- Graphics and Computer Vision,Entropy,Image Processing,feature extraction,image content descriptors,image database,mscthesis,salient features,scale selection,scale-space,visual saliency}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Automation and Robotics,Computer Imaging- Graphics and Computer Vision,Entropy,Image Processing,feature extraction,image content descriptors,image database,mscthesis,salient features,scale selection,scale-space,visual saliency}, month = nov, number = {2}, pages = {83--105}, title = {{Saliency, Scale and Image Description}}, url = {http://link.springer.com/article/10.1023/A:1012460413855 http://link.springer.com/content/pdf/10.1023/A:1012460413855.pdf}, volume = {45}, year = {2001} }  @article{Lazebnik2005, abstract = {This paper introduces a texture representation suitable for recognizing images of textured surfaces under a wide range of transformations, including viewpoint changes and nonrigid deformations. At the feature extraction stage, a sparse set of affine Harris and Laplacian regions is found in the image. Each of these regions can be thought of as a texture element having a characteristic elliptic shape and a distinctive appearance pattern. 
This pattern is captured in an affine-invariant fashion via a process of shape normalization followed by the computation of two novel descriptors, the spin image and the RIFT descriptor. When affine invariance is not required, the original elliptical shape serves as an additional discriminative feature for texture recognition. The proposed approach is evaluated in retrieval and classification tasks using the entire Brodatz database and a publicly available collection of 1,000 photographs of textured surfaces taken from different viewpoints.}, author = {Lazebnik, S. and Schmid, C. and Ponce, J.}, doi = {10.1109/TPAMI.2005.151}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/RAGAFU7T/Lazebnik et al. - 2005 - A sparse texture representation using local affine.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/I5VGUXH2/abs\_all.html:html}, issn = {0162-8828}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, keywords = {Algorithms,Brodatz database,Cluster Analysis,Computer Graphics,Computer vision,Detectors,Image Enhancement,Image Interpretation- Computer-Assisted,Image analysis,Image databases,Image recognition,Image texture analysis,Index Terms- Image processing and computer vision,Information Storage and Retrieval,Information retrieval,Laplacian regions,Numerical Analysis- Computer-Assisted,Pattern Recognition- Automated,Shape,Spatial databases,Surface texture,affine-invariant fashion,artificial intelligence,distinctive appearance pattern,elliptic shape,elliptical shape,feature extraction,feature measurement,image representation,image texture,local affine regions,mscthesis,pattern recognition.,shape normalization,sparse set,sparse texture representation,texture,texture element,texture recognition,visual databases}, mendeley-tags = {Algorithms,Brodatz database,Cluster Analysis,Computer Graphics,Computer vision,Detectors,Image Enhancement,Image Interpretation- 
Computer-Assisted,Image analysis,Image databases,Image recognition,Image texture analysis,Index Terms- Image processing and computer vision,Information Storage and Retrieval,Information retrieval,Laplacian regions,Numerical Analysis- Computer-Assisted,Pattern Recognition- Automated,Shape,Spatial databases,Surface texture,affine-invariant fashion,artificial intelligence,distinctive appearance pattern,elliptic shape,elliptical shape,feature extraction,feature measurement,image representation,image texture,local affine regions,mscthesis,pattern recognition.,shape normalization,sparse set,sparse texture representation,texture,texture element,texture recognition,visual databases}, month = aug, number = {8}, pages = {1265--1278}, title = {{A sparse texture representation using local affine regions}}, url = {http://ieeexplore.ieee.org/ielx5/34/31215/01453514.pdf?tp=\&arnumber=1453514\&isnumber=31215 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1453514\&tag=1}, volume = {27}, year = {2005} }  @article{LeRoux2008, abstract = {Deep belief networks (DBN) are generative neural network models with many layers of hidden explanatory factors, recently introduced by Hinton, Osindero, and Teh (2006) along with a greedy layer-wise unsupervised learning algorithm. The building block of a DBN is a probabilistic model called a restricted Boltzmann machine (RBM), used to represent one layer of the model. Restricted Boltzmann machines are interesting because inference is easy in them and because they have been successfully used as building blocks for training deeper models. We first prove that adding hidden units yields strictly improved modeling power, while a second theorem shows that RBMs are universal approximators of discrete distributions. We then study the question of whether DBNs with more layers are strictly more powerful in terms of representational power. 
This suggests a new and less greedy criterion for training RBMs within DBNs.}, author = {{Le Roux}, Nicolas and Bengio, Yoshua}, doi = {10.1162/neco.2008.04-07-510}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Le Roux, Bengio - 2008 - Representational power of restricted boltzmann machines and deep belief networks.pdf:pdf}, issn = {0899-7667}, journal = {Neural computation}, keywords = {Algorithms,Animals,Computer Simulation,Computer-Assisted,Humans,Learning,Learning: physiology,Models,Neural Networks (Computer),Signal Processing,Statistical,mscthesis}, mendeley-tags = {mscthesis}, month = jun, number = {6}, pages = {1631--49}, pmid = {18254699}, title = {{Representational power of restricted boltzmann machines and deep belief networks.}}, url = {http://www.ncbi.nlm.nih.gov/pubmed/18254699}, volume = {20}, year = {2008} }  @article{LeCun1998, abstract = {Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. 
Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day}, annote = {$\backslash$begin\{itemize\}$\backslash$item LeNet-5$\backslash$item Clarification: In this paper stride'' is not mentioned, but as Krizhevsky2012 et.al. started using it, new implementations of CNN need to define its value.$\backslash$item Conv: Convolutional layer$\backslash$item Subs: Subsampling layer (summed * coefficient + bias)$\backslash$item Full: Fully connected network$\backslash$item ERBF: Euclidian Radial Basis Function units$\backslash$begin\{itemize\}$\backslash$item input 32x32 pixel image (original images are 28x28)$\backslash$item Conv1 :$\backslash$begin\{itemize\}$\backslash$item 6@28x28 filter 5x5$\backslash$item stride 1$\backslash$item Connections = \$5*5*28*28*6 + 6*28*28 = 122,304\\backslash$item Train. param. = \$5*5*6 + 6 = 156\\backslash$end\{itemize\}$\backslash$item Subs2 :$\backslash$begin\{itemize\}$\backslash$item 6@14x14 range 2x2$\backslash$item stride 2$\backslash$item Connections = \$6*28*28 + 6*14*14 = 5,880\\backslash$item Train. param. = coefficient + bias = \$6 + 6 = 156\\backslash$end\{itemize\}$\backslash$item Conv3 :$\backslash$begin\{itemize\}$\backslash$item 16@10x10 filter 5x5$\backslash$item stride 1$\backslash$item Connections \$= 6*5*5*10*10*10 + 10*10*16 = 151,600\\backslash$item Train. param. 
\$= 5*5*3*6 + 5*5*4*9 + 5*5*6*1 + 16 = 1,516\\backslash$item Note:$\backslash$item This layer is not completly connected, see table 1 for specific connections$\backslash$item Expected Connections \$= 6*5*5*10*10*16 + 10*10*16 = 241,600\\backslash$item Expected train. param \$= 5*5*16*6 + 16 = 2416\\backslash$end\{itemize\}$\backslash$item Subs4 :$\backslash$begin\{itemize\}$\backslash$item 16@5x5 range 2x2$\backslash$item stride 2$\backslash$item Connections \$= 16*10*10 + 16*5*5 = 2,000\\backslash$item Train. param. = coefficient + bias \$= 16 + 16 = 32\\backslash$end\{itemize\}$\backslash$item Conv5 :$\backslash$begin\{itemize\}$\backslash$item 120@1x1 filter 5x5$\backslash$item stride 0$\backslash$item Connections and train. param. \$= 16*5*5*120 + 120 = 48,120\\backslash$end\{itemize\}$\backslash$item Full6 : 84 Atanh(Sa)$\backslash$begin\{itemize\}$\backslash$item Connections and train. param. \$= 120*84 + 84 = 10,164\\backslash$end\{itemize\}$\backslash$item ERBF7 : 10$\backslash$begin\{itemize\}$\backslash$item Connections and train. param. = \$84*10 = 840\\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {LeCun, Y and Bottou, L}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/LeCun, Bottou - 1998 - Gradient-based learning applied to document recognition.pdf:pdf}, journal = {Proceedings of the \ldots}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, title = {{Gradient-based learning applied to document recognition}}, url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=726791 http://yann.lecun.com/exdb/publis/pdf/lecun-01a.pdf}, year = {1998} }  @incollection{Cun1986, abstract = {Threshold functions and related operators are widely used as basic elements of adaptive and associative networks [Nakano 72, Amari 72, Hopfield 82]. There exist numerous learning rules for finding a set of weights to achieve a particular correspondence between input-output pairs. 
But early works in the field have shown that the number of threshold functions (or linearly separable functions) in N binary variables is small compared to the number of all possible boolean mappings in N variables, especially if N is large. This problem is one of the main limitations of most neural networks models where the state is fully specified by the environment during learning: they can only learn linearly separable functions of their inputs. Moreover, a learning procedure which requires the outside world to specify the state of every neuron during the learning session can hardly be considered as a general learning rule because in real-world conditions, only a partial information on the “ideal” network state for each task is available from the environment. It is possible to use a set of so-called “hidden units” [Hinton,Sejnowski,Ackley. 84], without direct interaction with the environment, which can compute intermediate predicates. Unfortunately, the global response depends on the output of a particular hidden unit in a highly non-linear way, moreover the nature of this dependence is influenced by the states of the other cells.}, author = {LeCun, Yann}, editor = {Bienenstock, E. and Souli\'{e}, F. Fogelman and Weisbuch, G.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/UJTDSWW5/978-3-642-82657-3\_24.html:html;:share/imagedb/perellm1/references/LeCun\_1986\_Learning Process in an Asymmetric Threshold Network.pdf:pdf}, isbn = {978-3-642-82659-7, 978-3-642-82657-3}, keywords = {Artificial Intelligence (incl. Robotics),Computer Appl. in Life Sciences,Health Informatics,mscthesis}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Computer Appl. 
in Life Sciences,Health Informatics,mscthesis}, month = jan, pages = {233--240}, publisher = {Springer Berlin Heidelberg}, series = {NATO ASI Series}, title = {{Learning Process in an Asymmetric Threshold Network}}, url = {http://link.springer.com/chapter/10.1007/978-3-642-82657-3\_24}, year = {1986} }  @article{Mesnil2012, abstract = {Learning good representations from a large set of unlabeled data is a particularlychallenging task. Recent work (see Bengio (2009) for a review) shows that training deep architectures is a good way to extract such representations, by extractingand disentangling gradually higher-level factors of variation characterizing the inputdistribution. In this paper, we describe different kinds of layers we trained for learning representations in the setting of the Unsupervised and Transfer Learning Challenge. The strategy of our team won the final phase of the challenge. It combined andstacked different one-layer unsupervised learning algorithms, adapted to each of thefive datasets of the competition. This paper describes that strategy and the particularone-layer learning algorithms feeding a simple linear classifier with a tiny number oflabeled training samples (1 to 64 per class).}, author = {Mesnil, G and Dauphin, Y and Glorot, X and Rifai, Salah and Bengio, Yoshua and Goodfellow, Ian and Lavoie, Erick and Muller, Xavier and Desjardins, Guillaume and Warde-Farley, David and Vincent, Pascal and Courville, Aaron and Bergstra, James}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Mesnil et al. 
- 2012 - Unsupervised and Transfer Learning Challenge a Deep Learning Approach.pdf:pdf}, journal = {\ldots of Machine Learning \ldots}, keywords = {Auto-Encoders,Deep Learning,Denoising Auto-Encoders.,Neural Networks,Restricted Boltzmann Machines,Transfer Learning,Unsupervised Learning,mscthesis}, mendeley-tags = {mscthesis}, pages = {1--15}, title = {{Unsupervised and Transfer Learning Challenge: a Deep Learning Approach.}}, url = {http://msn.mtome.com/Publications/CiML/CiML-v7-book.pdf\#page=119}, volume = {7}, year = {2012} }  @article{Muhlenbein2009, abstract = {In this chapter fundamental problems of collaborative computational intelligence are discussed. The problems are distilled from the seminal research of Alan Turing and John von Neumann. For Turing the creation of machines with human-like intelligence was only a question of programming time. In his research he identified the most relevant problems concerning evolutionary computation, learning, and structure of an artificial brain. Many problems are still unsolved, especially efficient higher learning methods which Turing called initiative. Von Neumann was more cautious. He doubted that human-like intelligent behavior could be described unambiguously in finite time and finite space. Von Neumann focused on self-reproducing automata to create more complex systems out of simpler ones. An early proposal from John Holland is analyzed. It centers on adaptability and population of programs. The early research of Newell, Shaw, and Simon is discussed. They use the logical calculus to discover proofs in logic. Only a few recent research projects have the broad perspectives and the ambitious goals of Turing and von Neumann. 
As examples the projects Cyc, Cog, and JANUS are discussed.}, author = {M\"{u}hlenbein, Heinz}, chapter = {1}, file = {:share/imagedb/perellm1/references/M\"{u}hlenbein\_2009\_Computational Intelligence The Legacy of Alan Turing and John von Neumann.pdf:pdf}, isbn = {9783642017988}, journal = {Computational Intelligence}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {23--43}, shorttitle = {Computational intelligence}, title = {{Computational Intelligence: The Legacy of Alan Turing and John von Neumann}}, url = {http://publica.fraunhofer.de/documents/N-263493.html http://link.springer.com/chapter/10.1007/978-3-642-01799-5\_2}, year = {2009} }  @article{Neal1992, abstract = {Connectionist learning procedures are presented for “sigmoid” and “noisy-OR” varieties of probabilistic belief networks. These networks have previously been seen primarily as a means of representing knowledge derived from experts. Here it is shown that the “Gibbs sampling” simulation procedure for such networks can support maximum-likelihood learning from empirical data through local gradient ascent. This learning procedure resembles that used for “Boltzmann machines”, and like it, allows the use of “hidden” variables to model correlations between visible variables. Due to the directed nature of the connections in a belief network, however, the “negative phase” of Boltzmann machine learning is unnecessary. Experimental results show that, as a result, learning in a sigmoid belief network can be faster than in a Boltzmann machine. 
These networks have other advantages over Boltzmann machines in pattern classification and decision making applications, are naturally applicable to unsupervised learning problems, and provide a link between work on connectionist learning and work on the representation of expert knowledge.}, author = {Neal, Radford M.}, doi = {10.1016/0004-3702(92)90065-6}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/TJ2U7AST/0004370292900656.html:html}, issn = {0004-3702}, journal = {Artificial Intelligence}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = jul, number = {1}, pages = {71--113}, title = {{Connectionist learning of belief networks}}, url = {http://www.sciencedirect.com/science/article/pii/0004370292900656}, volume = {56}, year = {1992} }  @article{Over2013, author = {Over, Paul and Awad, George and Fiscus, Jon and Sanders, Greg}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Over et al. - 2013 - TRECVID 2013 – An Introduction to the Goals , Tasks , Data , Evaluation Mechanisms , and Metrics.pdf:pdf}, keywords = {mscthesis,trecvid}, mendeley-tags = {mscthesis,trecvid}, title = {{TRECVID 2013 - An Introduction to the Goals, Tasks, Data, Evaluation Mechanisms, and Metrics}}, year = {2013} }  @article{Rosenblatt1958, abstract = {To answer the questions of how information about the physical world is sensed, in what form is information remembered, and how does information retained in memory influence recognition and behavior, a theory is developed for a hypothetical nervous system called a perceptron. The theory serves as a bridge between biophysics and psychology. It is possible to predict learning curves from neurological variables and vice versa. The quantitative statistical approach is fruitful in the understanding of the organization of cognitive systems. 
18 references.}, author = {Rosenblatt, F.}, doi = {10.1037/h0042519}, file = {:share/imagedb/perellm1/references/Rosenblatt\_1958\_The perceptron a probabilistic model for information storage and organization in the brain.pdf:pdf}, issn = {1939-1471(Electronic);0033-295X(Print)}, journal = {Psychological review}, keywords = {*Brain,*Cognition,*Memory,Nervous System,mscthesis}, mendeley-tags = {*Brain,*Cognition,*Memory,Nervous System,mscthesis}, number = {6}, pages = {386--408}, shorttitle = {The perceptron}, title = {{The perceptron: a probabilistic model for information storage and organization in the brain.}}, url = {http://psycnet.apa.org/journals/rev/65/6/386/}, volume = {65}, year = {1958} }  @article{Uhr1987, author = {Uhr, L}, file = {:share/imagedb/perellm1/references/Uhr\_1987\_Highly parallel, hierarchical, recognition cone perceptual structures.pdf:pdf}, journal = {Parallel computer vision}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Highly parallel, hierarchical, recognition cone perceptual structures}}, url = {http://books.google.com/books?hl=en\&lr=\&id=6pG\_WIIIFRsC\&oi=fnd\&pg=PA249\&dq=Highly+Parallel,+Hierarchical,+Recognition+Cone+Perceptual+Structures\&ots=Re5i6TmJp5\&sig=WY6n8rvdiBQ0jMG5hL\_zcg3O9Rs}, year = {1987} }  @article{Vapnik1971, author = {Vapnik, VN and Chervonenkis, AY}, doi = {10.1137/1116025}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/G3FFRVT4/1116025.html:html;:share/imagedb/perellm1/references/Vapnik, Chervonenkis\_1971\_On the uniform convergence of relative frequencies of events to their probabilities.pdf:pdf}, issn = {0040-585X}, journal = {Theory of Probability \& Its Applications}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = jan, number = {2}, pages = {264--280}, title = {{On the uniform convergence of relative frequencies of events to their probabilities}}, url = {http://epubs.siam.org/doi/abs/10.1137/1116025}, volume = {16}, year = {1971} }  
@phdthesis{Werbos1974, author = {Werbos, Paul J.}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, school = {Harvard University}, title = {{Beyond regression: new tools for prediction and analysis in the behavioral sciences}}, year = {1974} }  @inproceedings{Weston2008, abstract = {We show how nonlinear semi-supervised embedding algorithms popular for use with “shallow” learning techniques such as kernel methods can be easily applied to deep multi-layer architectures, either as a regularizer at the output layer, or on each layer of the architecture. Compared to standard supervised backpropagation this can give significant gains. This trick provides a simple alternative to existing approaches to semi-supervised deep learning whilst yielding competitive error rates compared to those methods, and existing shallow semi-supervised techniques.}, address = {New York, New York, USA}, author = {Weston, Jason and Ratle, Fr\'{e}d\'{e}ric and Collobert, Ronan}, doi = {10.1145/1390156.1390303}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Weston, Ratle, Collobert - 2008 - Deep learning via semi-supervised embedding.pdf:pdf}, isbn = {9781605582054}, booktitle = {Proceedings of the 25th international conference on Machine learning - ICML '08}, keywords = {mscthesis,trecvid}, mendeley-tags = {mscthesis,trecvid}, pages = {1168--1175}, publisher = {ACM Press}, title = {{Deep learning via semi-supervised embedding}}, url = {http://portal.acm.org/citation.cfm?doid=1390156.1390303 http://cse.iitk.ac.in/users/cs671/2013/hw3/weston-ratle-collobert-12\_deep-learning-via-semi-supervised-embedding.pdf}, year = {2008} }  @article{Wolf2011, abstract = {Recognizing faces in unconstrained videos is a task of mounting importance. While obviously related to face recognition in still images, it has its own unique characteristics and algorithmic requirements. Over the years several methods have been suggested for this problem, and a few benchmark data sets have been assembled to facilitate its study. 
However, there is a sizable gap between the actual application needs and the current state of the art. In this paper we make the following contributions. (a) We present a comprehensive database of labeled videos of faces in challenging, uncontrolled conditions (i.e., in the wild'), the YouTube Faces' database, along with benchmark, pair-matching tests1. (b) We employ our benchmark to survey and compare the performance of a large variety of existing video face recognition techniques. Finally, (c) we describe a novel set-to-set similarity measure, the Matched Background Similarity (MBGS). This similarity is shown to considerably improve performance on the benchmark tests.}, author = {Wolf, Lior and Hassner, Tal and Maoz, Itay}, doi = {10.1109/CVPR.2011.5995566}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Wolf, Hassner, Maoz - 2011 - Face Recognition in Unconstrained Videos with Matched Background Similarity.pdf:pdf}, isbn = {978-1-4577-0394-2}, journal = {Cvpr 2011}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = jun, pages = {529--534}, publisher = {Ieee}, title = {{Face Recognition in Unconstrained Videos with Matched Background Similarity}}, url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=5995566}, year = {2011} }  @article{Krizhevsky2012, abstract = {We trained a large, deep convolutional neural network to classify the 1.2 millionhigh-resolution images in the ImageNet LSVRC-2010 contest into the 1000 dif-ferent classes. On the test data, we achieved top-1 and top-5 error rates of 37.5\%and 17.0\% which is considerably better than the previous state-of-the-art. Theneural network, which has 60 million parameters and 650,000 neurons, consistsof five convolutional layers, some of which are followed by max-pooling layers,and three fully-connected layers with a final 1000-way softmax. 
To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully-connected layers we employed a recently-developed regularization method called “dropout” that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3\%, compared to 26.2\% achieved by the second-best entry.}, annote = {$\backslash$begin\{itemize\}$\backslash$item CNN architecture:$\backslash$begin\{itemize\}$\backslash$item 650.000 neurons (60 million parameters)$\backslash$item 5 convolutional layers$\backslash$item Some of them followed by a max-pooling layer$\backslash$item 3 fully-connected layers$\backslash$item 1 1000-way softmax$\backslash$end\{itemize\}$\backslash$item Dropout regularization method to reduce overfitting in 3 fully-connected layers$\backslash$item Training time: 5-6 days on two GTX 580 3GB GPUs$\backslash$item Dataset:$\backslash$begin\{itemize\}$\backslash$item ILSVRC-2010$\backslash$item Down-sampled images to a fixed resolution of 256x256$\backslash$item Subtract the mean activity over the training set from each pixel$\backslash$end\{itemize\}$\backslash$item ReLU:$\backslash$begin\{itemize\}$\backslash$item \$f(x) = \backslash max(0,x)\\backslash$item Faster than tanh$\backslash$item ReLU: 6 epochs$\backslash$item tanh: 36 more epochs to achieve same performance$\backslash$end\{itemize\}$\backslash$item Local Response Normalization$\backslash$begin\{itemize\}$\backslash$item \$1.2\$and \$1.4\backslash\%\$error reduction$\backslash$item Helps generalization$\backslash$item \$b\_\{x,y\}\^{}i = a\_\{x,y\}\^{}i / \backslash left( k + \backslash alpha \backslash sum\backslash limits\_\{j=\backslash max(0,i-n/2)\}\^{}\{\backslash min(N-1,i+n/2)\}
(a\_\{x,y\}\^{}j)\^{}2 \backslash right)\^{}\backslash beta\\backslash$item \$k=2, n=5, \backslash alpha=10\^{}-4\$, and \$\backslash beta=0.75\\backslash$end\{itemize\}$\backslash$item Overlapping Pooling$\backslash$begin\{itemize\}$\backslash$item \$0.3\$and \$0.4\backslash\%\$error reduction$\backslash$item grid \$3 \backslash prod 3\\backslash$item stride = 2$\backslash$item Overlap each pooling one column pixel$\backslash$end\{itemize\}$\backslash$item Overall Architecture$\backslash$begin\{itemize\}$\backslash$item 224x224x3 (RGB image)$\backslash$item Conv 96 kernels of size 11x11x3 with stride of 4 pixels$\backslash$item Response-Normalized and max-pooling$\backslash$item Conv 256 kernels of size 5x5x48 with stride of ? pixels$\backslash$item Response-Normalized and max-pooling$\backslash$item Conv 384 kernels of size 3x3x256$\backslash$item Conv 384 kernels of size 3x3x192$\backslash$item Conv 256 kernels of size 3x3x192$\backslash$item ¿Response-Normalized? and Max-pooling$\backslash$item Fully connected 4096$\backslash$item Fully connected 4096$\backslash$item Fully connected 1000$\backslash$item Softmax$\backslash$end\{itemize\}$\backslash$figuremacro\{figures/krizhevsky2012\_convnet.pdf\}\{Architecture of the CNN\}\{\}$\backslash$item Data augmentation$\backslash$begin\{itemize\}$\backslash$item \$0.1\$error reduction$\backslash$item Original images escaled scaled and croped to 256x256$\backslash$item Extract 5 images of 224x224 from corners plus center$\backslash$item Mirror horizontally and get 5 more images$\backslash$item Augment data altering RGB channels:$\backslash$begin\{itemize\}$\backslash$item Perform PCA on RGB throughout the training set$\backslash$item Each training image add multiples of PCs with gaussian noise$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Dropout$\backslash$begin\{itemize\}$\backslash$item Put to zero the output of neurons with probability 0.5$\backslash$item At test time multiply the outputs by 
0.5$\backslash$item Two first fully-connected layers$\backslash$item Solves overfitting$\backslash$item Doubles the number of iterations required to converge$\backslash$end\{itemize\}$\backslash$item Details of learning$\backslash$begin\{itemize\}$\backslash$item batch size = 128$\backslash$item momentum 0.9$\backslash$item weight decay 0.0005$\backslash$item Initial weights from zero-mean Gaussian std=0.01$\backslash$item biases = 1 on second, fourth, fifth Conv and fully-connected$\backslash$item biases = 0 on the rest$\backslash$end\{itemize\}$\backslash$item Evaluation$\backslash$begin\{itemize\}$\backslash$item Consider the feature activations induced by an image at the last, 4096-dimensional hidden layer$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Krizhevsky, Sutskever, Hinton - 2012 - ImageNet Classification with Deep Convolutional Neural Networks.pdf:pdf}, journal = {NIPS}, keywords = {imagenet,mscthesis}, mendeley-tags = {imagenet,mscthesis}, pages = {1--9}, title = {{ImageNet Classification with Deep Convolutional Neural Networks}}, url = {https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf}, year = {2012} }  @article{Bruna2014, abstract = {Convolutional Neural Networks are extremely efficient architectures in image and audio recognition tasks, thanks to their ability to exploit the local translational invariance of signal classes over their domain. In this paper we consider possible generalizations of CNNs to signals defined on more general domains without the action of a translation group. In particular, we propose two constructions, one based upon a hierarchical clustering of the domain, and another based on the spectrum of the graph Laplacian. 
We show through experiments that for low- dimensional graphs it is possible to learn convolutional layers with a number of parameters independent of the input size, resulting in efficient deep architectures.}, archiveprefix = {arXiv}, arxivid = {arXiv:1312.6203v2}, author = {Bruna, Joan and Szlam, Arthur and Zaremba, Wojciech and LeCun, Yann}, eprint = {arXiv:1312.6203v2}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Bruna et al. - 2014 - Spectral Networks and Deep Locally Connected Networks on Graphs.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--14}, title = {{Spectral Networks and Deep Locally Connected Networks on Graphs}}, year = {2014} }  @book{Forsyth2002, author = {Forsyth, DA and Ponce, J}, file = {:share/imagedb/perellm1/references/Forsyth, Ponce\_2002\_Computer vision a modern approach.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Computer vision: a modern approach}}, url = {http://dl.acm.org/citation.cfm?id=580035}, year = {2002} }  @book{Friedman2001, author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Friedman, Hastie, Tibshirani - 2001 - The elements of statistical learning(2).pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Friedman, Hastie, Tibshirani - 2001 - The elements of statistical learning.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{The elements of statistical learning}}, url = {http://www.stanford.edu/~hastie/local.ftp/Springer/ESLII\_print5.pdf http://www-stat.stanford.edu/~tibs/book/preface.ps http://www.stanford.edu/~hastie/local.ftp/Springer/OLD//ESLII\_print4.pdf}, year = {2001} }  @article{Goodfellow2013, abstract = {Catastrophic forgetting is a problem faced by many machine learning models and algorithms. 
When trained on one task, then trained on a second task, many machine learning models "forget" how to perform the first task. This is widely believed to be a serious problem for neural networks. Here, we investigate the extent to which the catastrophic forgetting problem occurs for modern neural networks, comparing both established and recent gradient-based training algorithms and activation functions. We also examine the effect of the relationship between the first task and the second task on catastrophic forgetting. We find that it is always best to train using the dropout algorithm--the dropout algorithm is consistently best at adapting to the new task, remembering the old task, and has the best tradeoff curve between these two extremes. We find that different tasks and relationships between tasks result in very different rankings of activation function performance. This suggests the choice of activation function should always be cross-validated.}, archiveprefix = {arXiv}, arxivid = {arXiv:1312.6211v2}, author = {Goodfellow, IJ and Mirza, M and Da, X and Courville, Aaron and Bengio, Yoshua}, eprint = {arXiv:1312.6211v2}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Goodfellow et al. - 2013 - An Empirical Investigation of Catastrophic Forgeting in Gradient-Based Neural Networks.pdf:pdf}, journal = {arXiv preprint arXiv: \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{An Empirical Investigation of Catastrophic Forgetting in Gradient-Based Neural Networks}}, url = {http://arxiv.org/abs/1312.6211}, year = {2013} }  @incollection{Hariharan2014, abstract = {We aim to detect all instances of a category in an image and, for each instance, mark the pixels that belong to it. We call this task Simultaneous Detection and Segmentation (SDS). Unlike classical bounding box detection, SDS requires a segmentation and not just a box. Unlike classical semantic segmentation, we require individual object instances. 
We build on recent work that uses convolutional neural networks to classify category-independent region proposals (R-CNN [16]), introducing a novel architecture tailored for SDS. We then use category-specific, top-down figure-ground predictions to refine our bottom-up proposals. We show a 7 point boost (16\% relative) over our baselines on SDS, a 5 point boost (10\% relative) over state-of-the-art on semantic segmentation, and state-of-the-art performance in object detection. Finally, we provide diagnostic tools that unpack performance and provide directions for future work.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Segment one instance in a given image$\backslash$item Work on top of region proposal R-CNN$\backslash$item Dataset MSRC$\backslash$item Mark each pixel belonging to the detected instance$\backslash$item Algorithm: Simultaneous Detection and Segmentation$\backslash$begin\{itemize\}$\backslash$item Proposal generation: 2.000 region candidates using MCG$\backslash$item Feature extraction: Extract features with pretrained CNN (Alexnet) with two paths, with and without background.$\backslash$begin\{itemize\}$\backslash$item A : Extract CNN features from box and another with background masked$\backslash$item B : Second CNN is finetuned croping the box and removing background$\backslash$item C : Finetune both networks, one with the background and the other without$\backslash$item C + ref : refining the regions obtained from C$\backslash$end\{itemize\}$\backslash$item Region classification: linear SVM using fc6$\backslash$item Region refinement: non-maximum suppression on candidates and CNN for refinement$\backslash$end\{itemize\}$\backslash$item Results$\backslash$begin\{itemize\}$\backslash$item SegDPM detection PASCAL VOC2010: C+ref increases mean AP from 31.3 to 50.3$\backslash$item Pixel IU on VOC11: advance state-of-the-art about 5 points 10$\backslash$\% relative$\backslash$end\{itemize\}$\backslash$end\{itemize\} }, author = {Hariharan, 
Bharath and Arbel\'{a}ez, Pablo and Girshick, Ross and Malik, Jitendra}, editor = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/4P5U3QGP/Hariharan et al. - 2014 - Simultaneous Detection and Segmentation.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WGUIDHGV/978-3-319-10584-0\_20.html:html}, isbn = {978-3-319-10583-3, 978-3-319-10584-0}, keywords = {Artificial Intelligence (incl. Robotics),Computer Graphics,Image Processing and Computer Vision,Pattern Recognition,convolutional networks,detection,mscthesis,segmentation}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Computer Graphics,Image Processing and Computer Vision,Pattern Recognition,convolutional networks,detection,mscthesis,segmentation}, month = jan, pages = {297--312}, publisher = {Springer International Publishing}, series = {Lecture Notes in Computer Science}, title = {{Simultaneous Detection and Segmentation}}, url = {http://link.springer.com/chapter/10.1007/978-3-319-10584-0\_20 http://link.springer.com/content/pdf/10.1007/978-3-319-10584-0\_20.pdf}, year = {2014} }  @article{Hinton2007, abstract = {The uniformity of the cortical architecture and the ability of functions to move to different areas of cortex following early damage strongly suggest that there is a single basic learning algorithm for extracting underlying structure from richly structured, high-dimensional sensory data. There have been many attempts to design such an algorithm, but until recently they all suffered from serious computational weaknesses. 
This chapter describes several of the proposed algorithms and shows how they can be combined to produce hybrid methods that work efficiently in networks with many layers and millions of adaptive connections.}, author = {Hinton, Geoffrey}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hinton - 2007 - To recognize shapes, first learn to generate images.pdf:pdf}, journal = {Progress in brain research}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{To recognize shapes, first learn to generate images}}, url = {http://www.sciencedirect.com/science/article/pii/S0079612306650346 http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.78.9814\&rep=rep1\&type=pdf}, year = {2007} }  @article{Le2011, abstract = {We consider the problem of building high-level, class-specific feature detectors from only unlabeled data. For example, is it possible to learn a face detector using only unlabeled images? To answer this, we train a deep sparse autoencoder on a large dataset of images (the model has 1 billion connections, the dataset has 10 million 200×200 pixel images downloaded from the Internet). We train this network using model parallelism and asynchronous SGD on a cluster with 1,000 machines (16,000 cores) for three days. Contrary to what appears to be a widely-held intuition, our experimental results reveal that it is possible to train a face detector without having to label images as containing a face or not. Control experiments show that this feature detector is robust not only to translation but also to scaling and out-of-plane rotation. We also find that the same network is sensitive to other high-level concepts such as cat faces and human bodies. 
Starting from these learned features, we trained our network to recognize 22,000 object categories from ImageNet and achieve a leap of 70\% relative improvement over the previous state-of-the-art.}, archiveprefix = {arXiv}, arxivid = {arXiv:1112.6209v5}, author = {Le, QV and Ranzato, MA and Monga, R and Devin, Matthieu}, eprint = {arXiv:1112.6209v5}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Le et al. - 2011 - Building high-level features using large scale unsupervised learning.pdf:pdf}, journal = {arXiv preprint arXiv: \ldots}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, title = {{Building high-level features using large scale unsupervised learning}}, url = {http://arxiv.org/pdf/1112.6209.pdf}, year = {2011} }  @inproceedings{Lowe1999, abstract = {An object recognition system has been developed that uses a new class of local image features. The features are invariant to image scaling, translation, and rotation, and partially invariant to illumination changes and affine or 3D projection. These features share similar properties with neurons in inferior temporal cortex that are used for object recognition in primate vision. Features are efficiently detected through a staged filtering approach that identifies stable points in scale space. Image keys are created that allow for local geometric deformations by representing blurred image gradients in multiple orientation planes and at multiple scales. The keys are used as input to a nearest neighbor indexing method that identifies candidate object matches. Final verification of each match is achieved by finding a low residual least squares solution for the unknown model parameters. 
Experimental results show that robust object recognition can be achieved in cluttered partially occluded images with a computation time of under 2 seconds}, author = {Lowe, D.G.}, doi = {10.1109/ICCV.1999.790410}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/2RVIWNSV/Lowe - 1999 - Object recognition from local scale-invariant feat.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/INM2HT4W/abs\_all.html:html}, keywords = {3D projection,Computer science,Electrical capacitance tomography,Filters,Image recognition,Layout,Lighting,Neurons,Programmable logic arrays,Reactive power,blurred image gradients,candidate object matches,cluttered partially occluded images,computation time,computational geometry,feature extraction,image matching,inferior temporal cortex,least squares approximations,local geometric deformations,local image features,local scale-invariant features,low residual least squares solution,multiple orientation planes,nearest neighbor indexing method,object recognition,primate vision,robust object recognition,staged filtering approach,unknown model parameters}, mendeley-tags = {3D projection,Computer science,Electrical capacitance tomography,Filters,Image recognition,Layout,Lighting,Neurons,Programmable logic arrays,Reactive power,blurred image gradients,candidate object matches,cluttered partially occluded images,computation time,computational geometry,feature extraction,image matching,inferior temporal cortex,least squares approximations,local geometric deformations,local image features,local scale-invariant features,low residual least squares solution,multiple orientation planes,nearest neighbor indexing method,object recognition,primate vision,robust object recognition,staged filtering approach,unknown model parameters}, pages = {1150--1157 vol.2}, title = {{Object recognition from local scale-invariant features}}, url = 
{http://ieeexplore.ieee.org/ielx5/6412/17141/00790410.pdf?tp=\&arnumber=790410\&isnumber=17141 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=790410}, volume = {2}, year = {1999} }  @article{Mann1947, abstract = {Let $x$ and $y$ be two random variables with continuous cumulative distribution functions $f$ and $g$. A statistic $U$ depending on the relative ranks of the $x$'s and $y$'s is proposed for testing the hypothesis $f = g$. Wilcoxon proposed an equivalent test in the Biometrics Bulletin, December, 1945, but gave only a few points of the distribution of his statistic. Under the hypothesis $f = g$ the probability of obtaining a given $U$ in a sample of $n$ $x$'s and $m$ $y$'s is the solution of a certain recurrence relation involving $n$ and $m$. Using this recurrence relation tables have been computed giving the probability of $U$ for samples up to $n = m = 8$. At this point the distribution is almost normal. From the recurrence relation explicit expressions for the mean, variance, and fourth moment are obtained. The $2r$th moment is shown to have a certain form which enabled us to prove that the limit distribution is normal if $m, n$ go to infinity in any arbitrary manner.
The test is shown to be consistent with respect to the class of alternatives $f(x) > g(x)$ for every $x$.}, author = {Mann, HB and Whitney, DR}, doi = {10.1214/aoms/1177730491}, file = {:share/imagedb/perellm1/references/Mann, Whitney\_1947\_On a test of whether one of two random variables is stochastically larger than the other.pdf:pdf}, issn = {0003-4851, 2168-8990}, journal = {The Annals of Mathematical Statistics}, keywords = {mscthesis}, language = {EN}, mendeley-tags = {mscthesis}, month = mar, number = {1}, pages = {50--60}, title = {{On a test of whether one of two random variables is stochastically larger than the other}}, url = {http://projecteuclid.org/euclid.aoms/1177730491 http://www.jstor.org/stable/2236101}, volume = {18}, year = {1947} }  @article{Mikolajczyk2005, abstract = {In this paper, we compare the performance of local detectors and descriptors in the context of object class recognition. Recently, many detectors/descriptors have been evaluated in the context of matching as well as invariance to viewpoint changes (Mikolajczyk and Schmid, 2004). However, it is unclear if these results can be generalized to categorization problems, which require different properties of features. We evaluate 5 state-of-the-art scale invariant region detectors and 5 descriptors. Local features are computed for 20 object classes and clustered using hierarchical agglomerative clustering. We measure the quality of appearance clusters and location distributions using entropy as well as precision. We also measure how the clusters generalize from training set to novel test data. Our results indicate that attended SIFT descriptors (Mikolajczyk and Schmid, 2005) computed on Hessian-Laplace regions perform best. Second score is obtained by salient regions (Kadir and Brady, 2001). The results also show that these two detectors provide complementary features.
The new detectors/descriptors significantly improve the performance of a state-of-the art recognition approach (Leibe, et al., 2005) in pedestrian detection task}, author = {Mikolajczyk, K. and Leibe, B. and Schiele, B.}, doi = {10.1109/ICCV.2005.146}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/3FMQ8TXE/Mikolajczyk et al. - 2005 - Local features for object class recognition.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/DBA4ZV59/abs\_all.html:html}, journal = {Computer Vision, 2005. \ldots}, keywords = {Art,Computer vision,Detectors,Entropy,Hessian-Laplace regions,Image recognition,Interactive systems,Object detection,Photometry,SIFT descriptors,Testing,categorization problems,feature extraction,hierarchical agglomerative clustering,mscthesis,object class recognition,object recognition,pattern classification,pattern clustering,pedestrian detection,salient regions,scale invariant region detectors}, mendeley-tags = {Art,Computer vision,Detectors,Entropy,Hessian-Laplace regions,Image recognition,Interactive systems,Object detection,Photometry,SIFT descriptors,Testing,categorization problems,feature extraction,hierarchical agglomerative clustering,mscthesis,object class recognition,object recognition,pattern classification,pattern clustering,pedestrian detection,salient regions,scale invariant region detectors}, month = oct, pages = {1792--1799 Vol. 
2}, title = {{Local features for object class recognition}}, url = {http://ieeexplore.ieee.org/ielx5/10347/32976/01544934.pdf?tp=\&arnumber=1544934\&isnumber=32976 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1544934\&tag=1 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1544934}, volume = {2}, year = {2005} }  @book{Nilsson1965, author = {Nilsson, Nils J.}, file = {:share/imagedb/perellm1/references/Nilsson\_1965\_Learning machines foundations of trainable pattern-classifying systems.pdf:pdf}, keywords = {Computers / Intelligence (AI) \& Semantics,mscthesis}, language = {en}, mendeley-tags = {Computers / Intelligence (AI) \& Semantics,mscthesis}, pages = {160}, publisher = {McGraw-Hill}, shorttitle = {Learning machines}, title = {{Learning machines: foundations of trainable pattern-classifying systems}}, url = {http://books.google.fi/books?id=YMdfAAAAMAAJ}, year = {1965} }  @article{Sermanet2014, abstract = {We present an integrated framework for using Convolutional Networks for classification, localization and detection. We show how a multiscale and sliding window approach can be efficiently implemented within a ConvNet. We also introduce a novel deep learning approach to localization by learning to predict object boundaries. Bounding boxes are then accumulated rather than suppressed in order to increase detection confidence. We show that different tasks can be learned simultaneously using a single shared network. This integrated framework is the winner of the localization task of the ImageNet Large Scale Visual Recognition Challenge 2013 (ILSVRC2013) and obtained very competitive results for the detection and classifications tasks. In post-competition work, we establish a new state of the art for the detection task. 
Finally, we release a feature extractor from our best model called OverFeat.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Framework for using CNN$\backslash$begin\{itemize\}$\backslash$item classification$\backslash$item localization$\backslash$item detection$\backslash$end\{itemize\}$\backslash$item Winner on localization task of ILSVRC2013$\backslash$item ConvNets are trained enterily with the raw pixels$\backslash$item Other approaches for detection and localization$\backslash$item appling a sliding window over multiples scales$\backslash$item$\backslash$dots$\backslash$item$\backslash$dots$\backslash$end\{itemize\}}, archiveprefix = {arXiv}, arxivid = {arXiv:1312.6229v3}, author = {Sermanet, Pierre and Eigen, David and Zhang, X and Mathieu, Michael and Fergus, Rob and LeCun, Yann}, eprint = {arXiv:1312.6229v3}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Sermanet et al. - 2014 - OverFeat Integrated Recognition, Localization and Detection using Convolutional Networks.pdf:pdf}, journal = {arXiv preprint arXiv: \ldots}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, pages = {1--16}, title = {{OverFeat: Integrated Recognition, Localization and Detection using Convolutional Networks}}, url = {http://arxiv.org/abs/1312.6229 http://arxiv.org/pdf/1312.6229.pdf}, year = {2014} }  @book{Shannon1972, author = {Shannon, Claude Elwood and McCarthy, John}, isbn = {0691079161}, keywords = {Technology \& Engineering / Electrical,mscthesis}, language = {en}, mendeley-tags = {Technology \& Engineering / Electrical,mscthesis}, pages = {300}, publisher = {Princeton University Press}, shorttitle = {Automata Studies}, title = {{Automata Studies: Annals of Mathematics Studies. 
Number 34}}, url = {http://books.google.fi/books?id=oL57iECEeEwC}, year = {1972} }  @article{Simard2003, abstract = {Neural networks are a powerful technology for classification of visual inputs arising from documents. However, there is a confusing plethora of different neural network methods that are used in the literature and in industry. This paper describes a set of concrete best practices that document analysis researchers can use to get good results with neural networks. The most important practice is getting a training set as large as possible: we expand the training set by adding a new form of distorted data. The next most important practice is that convolutional neural networks are better suited for visual document tasks than fully connected networks. We propose that a simple ``do-it-yourself'' implementation of convolution with a flexible architecture is suitable for many visual document problems. This simple convolutional neural network does not require complex methods, such as momentum, weight decay, structure-dependent learning rates, averaging layers, tangent prop, or even finely-tuning the architecture. The end result is a very simple yet general architecture which can yield state-of-the-art performance for document analysis. 
Weillustrate our claims on the MNIST set of English digitimages.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Get a training set as large as possible$\backslash$item No need of complex methods, such as momentum, weight decay, structure-dependent learning rates, averaging layers, tangent prop, or even finely-tuning the architecture$\backslash$item Increment dataset by:$\backslash$begin\{itemize\}$\backslash$item Affine transformations: translations, scaling, homothety, similarity transformation, reflection, rotation, shear mapping, and compositions.$\backslash$item Elastic distortions$\backslash$end\{itemize\}$\backslash$item In this paper the authors justify the use of elastic deformations on MNIST data corresponding to uncontrolled oscillations of the hand muscles, dampened by inertia.$\backslash$item They get the best results on MNIST to date with CNN, affine and elastic transformations of the dataset (0.4$\backslash$\% error).$\backslash$end\{itemize\}}, author = {Simard, P and Steinkraus, Dave and Platt, JC}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Simard, Steinkraus, Platt - 2003 - Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis.pdf:pdf}, journal = {ICDAR}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, title = {{Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis}}, url = {http://vnlab.ce.sharif.ir/courses/85-86/2/ce667/resources/root/15 - Convolutional N. N./ICDAR03.pdf}, year = {2003} }  @article{Ackley1985, abstract = {The computational power of massively parallel networks of simple processing elements resides in the communication bandwidth provided by the hardware connections between elements. These connections can allow a significant fraction of the knowledge of the system to be applied to an instance of a problem in a very short time. 
One kind of computation for which massively parallel networks appear to be well suited is large constraint satisfaction searches, but to use the connections efficiently two conditions must be met: First, a search technique that is suitable for parallel networks must be found. Second, there must be some way of choosing internal representations which allow the preexisting hardware connections to be used efficiently for encoding the constraints in the domain being searched. We describe a general parallel search method, based on statistical mechanics, and we show how it leads to a general learning rule for modifying the connection strengths so as to incorporate knowledge about a task domain in an efficient way. We describe some simple examples in which the learning algorithm creates internal representations that are demonstrably the most efficient way of using the preexisting connectivity structure.}, author = {Ackley, David H. and Hinton, Geoffrey E. and Sejnowski, Terrence J.}, doi = {10.1207/s15516709cog0901\_7}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Z5B29HBN/Ackley et al. - 1985 - A Learning Algorithm for Boltzmann Machines.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WIG2MVZP/abstract.html:html}, issn = {1551-6709}, journal = {Cognitive Science}, keywords = {mscthesis}, language = {en}, mendeley-tags = {mscthesis}, month = jan, number = {1}, pages = {147--169}, title = {{A Learning Algorithm for Boltzmann Machines*}}, url = {http://onlinelibrary.wiley.com/doi/10.1207/s15516709cog0901\_7/abstract http://onlinelibrary.wiley.com/store/10.1207/s15516709cog0901\_7/asset/s15516709cog0901\_7.pdf?v=1\&t=i220c3ac\&s=ab7f799bbf6a45c35bb21e0dab7e5e4dff07dd3a}, volume = {9}, year = {1985} }  @article{Agrawal1993a, abstract = {We are given a large database of customer transactions. Each transaction consists of items purchased by a customer in a visit. 
We present an efficient algorithm that generates all significant association rules between items in the database. The algorithm incorporates buffer management and novel estimation and pruning techniques. We also present results of applying this algorithm to sales data obtained from a large retailing company, which shows the effectiveness of the algorithm.}, address = {New York, NY, USA}, author = {Agrawal, Rakesh and Imielinski, Tomasz and Swami, Arun}, doi = {10.1145/170035.170072}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/V25SZ7TN/Agrawal et al. - 1993 - Mining Association Rules Between Sets of Items in .pdf:pdf}, isbn = {0-89791-592-5}, journal = {ACM SIGMOD Record}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {207--216}, publisher = {ACM}, series = {SIGMOD '93}, title = {{Mining association rules between sets of items in large databases}}, url = {http://doi.acm.org/10.1145/170035.170072 http://dl.acm.org/ft\_gateway.cfm?id=170072\&type=pdf http://dl.acm.org/citation.cfm?id=170072}, year = {1993} }  @article{Bay2006, author = {Bay, Herbert and Tuytelaars, Tinne and Gool, Luc Van}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Bay, Tuytelaars, Gool - Unknown - SURF Speeded Up Robust Features.pdf:pdf}, journal = {Computer Vision-ECCV 2006}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Surf: Speeded up robust features}}, url = {http://link.springer.com/chapter/10.1007/11744023\_32}, year = {2006} }  @article{Eigen2013, abstract = {A key challenge in designing convolutional network models is sizing them appropriately. Many factors are involved in these decisions, including number of layers, feature maps, kernel sizes, etc. Complicating this further is the fact that each of these influence not only the numbers and dimensions of the activation units, but also the total number of parameters. 
In this paper we focus on assessing the independent contributions of three of these linked variables: The numbers of layers, feature maps, and parameters. To accomplish this, we employ a recursive convolutional network whose weights are tied between layers; this allows us to vary each of the three factors in a controlled setting. We find that while increasing the numbers of layers and parameters each have clear benefit, the number of feature maps (and hence dimensionality of the representation) appears ancillary, and finds most of its benefit through the introduction of more weights. Our results (i) empirically confirm the notion that adding layers alone increases computational power, within the context of convolutional layers, and (ii) suggest that precise sizing of convolutional feature map dimensions is itself of little concern; more attention should be paid to the number of parameters in these layers instead.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Deeper models are preferred over shallow ones$\backslash$item Performance is independent of the number of units, when depth and parameters remains constant$\backslash$item Recurrent Neural Network:$\backslash$begin\{itemize\}$\backslash$item Convolutional architecture$\backslash$item all layers same number of feature maps$\backslash$item weights are tied across layers$\backslash$item ReLU in all layers$\backslash$item Max-pooling with non-overlaping windows$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, archiveprefix = {arXiv}, arxivid = {arXiv:1312.1847v1}, author = {Eigen, David and Rolfe, Jason and Fergus, Rob and LeCun, Y}, eprint = {arXiv:1312.1847v1}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Eigen et al. 
- 2013 - Understanding Deep Architectures using a Recursive Convolutional Network.pdf:pdf}, journal = {arXiv preprint arXiv:1312.1847}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, pages = {1--9}, title = {{Understanding Deep Architectures using a Recursive Convolutional Network}}, url = {http://arxiv.org/abs/1312.1847 http://arxiv.org/pdf/1312.1847.pdf}, year = {2013} }  @article{Fang2014, abstract = {This paper presents a novel approach for automatically generating image descriptions: visual detectors and language models learn directly from a dataset of image captions. We use Multiple Instance Learning to train visual detectors for words that commonly occur in captions, including many different parts of speech such as nouns, verbs, and adjectives. The word detector outputs serve as conditional inputs to a maximum-entropy language model. The language model learns from a set of over 400,000 image descriptions to capture the statistics of word usage. We capture global semantics by re-ranking caption candidates using sentence-level features and a deep multimodal similarity model. When human judges compare the system captions to ones written by other people, the system captions have equal or better quality over 23\% of the time.}, annote = {Comment: Added appendix}, author = {Fang, Hao and Gupta, Saurabh and Iandola, Forrest}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WNDD4F67/Fang et al. 
- 2014 - From Captions to Visual Concepts and Back.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/QXJCEFBU/1411.html:html}, journal = {arXiv preprint arXiv: \ldots}, keywords = {Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Rec,mscthesis}, mendeley-tags = {Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Rec,mscthesis}, month = nov, title = {{From Captions to Visual Concepts and Back}}, url = {http://arxiv.org/abs/1411.4952 http://www.arxiv.org/pdf/1411.4952.pdf}, year = {2014} }  @book{Glassner1995, author = {Glassner, AS}, file = {:share/imagedb/perellm1/references/Glassner\_1995\_Principles of digital image synthesis Vol. 1.pdf:pdf}, isbn = {9780122862519}, keywords = {mscthesis}, language = {en}, mendeley-tags = {mscthesis}, pages = {678}, publisher = {Elsevier}, shorttitle = {Principles of digital image synthesis}, title = {{Principles of digital image synthesis: Vol. 1}}, url = {http://books.google.fi/books?id=l3dnYQ\_btH4C http://books.google.com/books?hl=en\&lr=\&id=l3dnYQ\_btH4C\&oi=fnd\&pg=PR21\&dq=Principles+of+digital+image+synthesis:+Vol.+1\&ots=BTlMXt7B9E\&sig=4my5WSbTpFiYoZfVq5kLFPf0OZw http://books.google.com/books?hl=en\&lr=\&id=l3dnYQ\_btH4C\&oi=fnd\&pg=PR21\&dq=Principles+of+digital+image+synthesis:+Vol.+1\&ots=BTlMXt7CaD\&sig=m0kwpktl9lFaaLXkL0YdEuA9PaM}, year = {1995} }  @book{Glimm2006, abstract = {The ideas of John von Neumann have had a profound influence on modern mathematics and science. One of the great thinkers of our century, von Neumann initiated major branches of mathematics--from operator algebras to game theory to scientific computing--and had a fundamental impact on such areas as self-adjoint operators, ergodic theory and the foundations of quantum mechanics, and numerical analysis and the design of the modern computer. 
This volume contains the proceedings of an AMS Symposium in Pure Mathematics, held at Hofstra University, in May 1988. The symposium brought together some of the foremost researchers in the wide range of areas in which von Neumann worked. These articles illustrate the sweep of von Neumann's ideas and thinking and document their influence on contemporary mathematics. In addition, some of those who knew von Neumann when he was alive have presented here personal reminiscences about him. This book is directed to those interested in operator theory, game theory, ergodic theory, and scientific computing, as well as to historians of mathematics and others having an interest in the contemporary history of the mathematical sciences. This book will give readers an appreciation for the workings of the mind of one of the mathematical giants of our time.}, author = {Glimm, JG and Impagliazzo, John and Singer, Isadore}, file = {:share/imagedb/perellm1/references/Glimm, Impagliazzo, Singer\_2006\_The legacy of John von Neumann.djvu:djvu}, isbn = {9780821868164}, keywords = {Mathematics / General,mscthesis}, language = {en}, mendeley-tags = {Mathematics / General,mscthesis}, month = sep, pages = {346}, publisher = {American Mathematical Soc.}, title = {{The legacy of John von Neumann}}, url = {http://books.google.fi/books?id=XBK-r0gS0YMC http://books.google.com/books?hl=en\&lr=\&id=XBK-r0gS0YMC\&oi=fnd\&pg=PA1\&dq=The+Legacy+of+John+Von+Neumann\&ots=ygBYUuy8Oj\&sig=JnEXKPYmTaWvQWw-19ZTLEJxYME}, year = {2006} }  @incollection{GuptaMadanM.1994, author = {Gupta, Madan M and Knopf, George K}, booktitle = {Neuro-Vision Systems: Principles and Applications}, file = {:share/imagedb/perellm1/references/Gupta, Knopf\_1994\_Neuro-vision systems A tutorial(2).pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--34}, title = {{Neuro-vision systems: A tutorial.}}, url = {http://homepage.usask.ca/~mmg864/paper/CB/CB-053.pdf}, year = {1994} }  @article{Harris1988, author = 
{Harris, C. and Stephens, M.}, doi = {10.5244/C.2.23}, file = {:share/imagedb/perellm1/references/Harris, Stephens\_1988\_A Combined Corner and Edge Detector.pdf:pdf}, journal = {Procedings of the Alvey Vision Conference 1988}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {23.1--23.6}, publisher = {Alvey Vision Club}, title = {{A Combined Corner and Edge Detector}}, url = {http://www.bmva.org/bmvc/1988/avc-88-023.html}, year = {1988} }  @article{He2014, abstract = {Existing deep convolutional neural networks (CNNs) require a fixed-size (e.g., 224x224) input image. This requirement is "artificial" and may reduce the recognition accuracy for the images or sub-images of an arbitrary size/scale. In this work, we equip the networks with a more principled pooling strategy, "spatial pyramid pooling", to eliminate the above requirement. The new network structure, called SPP-net, can generate a fixed-length representation regardless of image size/scale. Pyramid pooling is also robust to object deformations. With these advantages, SPP-net should in general improve all CNN-based image classification methods. On the ImageNet 2012 dataset, we demonstrate that SPP-net boosts the accuracy of a variety of published CNN architectures despite their different designs. On the Pascal VOC 2007 and Caltech101 datasets, SPP-net achieves state-of-the-art classification results using a single full-image representation and no fine-tuning. The power of SPP-net is also significant in object detection. Using SPP-net, we compute the feature maps from the entire image only once, and then pool features in arbitrary regions (sub-images) to generate fixed-length representations for training the detectors. This method avoids repeatedly computing the convolutional features. 
In processing test images, our method computes convolutional features 30-170x faster than the recent and most accurate method R-CNN (and 24-64x faster overall), while achieving better or comparable accuracy on Pascal VOC 2007. In ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2014, our methods rank \#2 in object detection and \#3 in image classification among all 38 teams. This manuscript also introduces the improvement made for this competition.}, annote = {Comment: This manuscript (v2) is an extended technical report of our ECCV 2014 paper. This manuscript introduces the details of our methods for ILSVRC 2014 (rank \#2 in DET and \#3 in CLS)}, author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/8XXBN5QR/He et al. - 2014 - Spatial Pyramid Pooling in Deep Convolutional Netw.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/IV8N3I9E/1406.html:html}, journal = {arXiv:1406.4729 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, month = jun, title = {{Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition}}, url = {http://arxiv.org/abs/1406.4729 http://www.arxiv.org/pdf/1406.4729.pdf}, year = {2014} }  @article{Hoyer2000, abstract = {Previous work has shown that independent component analysis (ICA) applied to feature extraction from natural image data yields features resembling Gabor functions and simple-cell receptive fields. This article considers the effects of including chromatic and stereo information. The inclusion of colour leads to features divided into separate red/green, blue/yellow, and bright/dark channels. Stereo image data, on the other hand, leads to binocular receptive fields which are tuned to various disparities. 
The similarities between these results and the observed properties of simple cells in the primary visual cortex are further evidence for the hypothesis that visual cortical neurons perform some type of redundancy reduction, which was one of the original motivations for ICA in the first place. In addition, ICA provides a principled method for feature extraction from colour and stereo images; such features could be used in image processing operations such as denoising and compression, as well as in pattern recognition.}, author = {Hoyer, P O and Hyv\"{a}rinen, A}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hoyer, Hyv\"{a}rinen - 2000 - Independent component analysis applied to feature extraction from colour and stereo images.pdf:pdf}, isbn = {3589451327}, issn = {0954-898X}, journal = {Network (Bristol, England)}, keywords = {Algorithms,Binocular,Binocular: physiology,Color Perception,Color Perception: physiology,Depth Perception,Depth Perception: physiology,Models,Neurological,Statistical,Vision,Visual Fields,Visual Fields: physiology,color,mscthesis}, mendeley-tags = {color,mscthesis}, month = aug, number = {3}, pages = {191--210}, pmid = {11014668}, title = {{Independent component analysis applied to feature extraction from colour and stereo images.}}, url = {http://www.ncbi.nlm.nih.gov/pubmed/11014668}, volume = {11}, year = {2000} }  @article{Hyvarinen2009, address = {London}, author = {Hyv\"{a}rinen, Aapo and Hurri, Jarmo and Hoyer, Patrik O.}, doi = {10.1007/978-1-84882-491-1}, file = {:share/imagedb/perellm1/references/Hyv\"{a}rinen, Hurri, Hoyer\_2009\_Natural Image Statistics.pdf:pdf}, isbn = {978-1-84882-490-4}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, publisher = {Springer London}, series = {Computational Imaging and Vision}, title = {{Natural Image Statistics}}, url = {http://www.springerlink.com/index/10.1007/978-1-84882-491-1}, volume = {39}, year = {2009} }  @article{Jarrett2009, abstract = {In many 
recent object recognition systems, feature extraction stages are generally composed of a filter bank, a non-linear transformation, and some sort of feature pooling layer. Most systems use only one stage of feature extraction in which the filters are hard-wired, or two stages where the filters in one or both stages are learned in supervised or unsupervised mode. This paper addresses three questions: 1. How does the non-linearities that follow the filter banks influence the recognition accuracy? 2. does learning the filter banks in an unsupervised or supervised manner improve the performance over random filters or hardwired filters? 3. Is there any advantage to using an architecture with two stages of feature extraction, rather than one? We show that using non-linearities that include rectification and local contrast normalization is the single most important ingredient for good accuracy on object recognition benchmarks. We show that two stages of feature extraction yield better accuracy than one. Most surprisingly, we show that a two-stage system with random filters can yield almost 63\% recognition rate on Caltech-101, provided that the proper non-linearities and pooling layers are used. Finally, we show that with supervised refinement, the system achieves state-of-the-art performance on NORB dataset (5.6\%) and unsupervised pre-training followed by supervised refinement produces good accuracy on Caltech-101 (> 65\%), and the lowest known error rate on the undistorted, unprocessed MNIST dataset (0.53\%).}, annote = {$\backslash$begin\{itemize\}$\backslash$item 1. Differences in non-linearities$\backslash$begin\{itemize\}$\backslash$item Rectifying non-linearity is the most important factor$\backslash$begin\{itemize\}$\backslash$item The polarization does not seem important$\backslash$item Or the possible cancelations are counterproductive$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item 2. 
unsupervised, supervised, random, and hardwired filters$\backslash$begin\{itemize\}$\backslash$item Hardwired filters have the worst performance$\backslash$item Random filters achieve good performance$\backslash$end\{itemize\}$\backslash$item 3. Deep vs shallow$\backslash$begin\{itemize\}$\backslash$item Two stages are better than one$\backslash$end\{itemize\}$\backslash$item Background$\backslash$begin\{itemize\}$\backslash$item Common approach steps:$\backslash$begin\{itemize\}$\backslash$item Feature extraction with some filter banks$\backslash$begin\{itemize\}$\backslash$item oriented edges$\backslash$item gabor filters$\backslash$end\{itemize\}$\backslash$item non-linear operation on the original features$\backslash$begin\{itemize\}$\backslash$item Quantization$\backslash$item winner-take-all$\backslash$item sparsification$\backslash$item normalization$\backslash$item point-wise saturation$\backslash$end\{itemize\}$\backslash$item pooling operation$\backslash$begin\{itemize\}$\backslash$item max pooling$\backslash$item average pooling$\backslash$item histogramming$\backslash$end\{itemize\}$\backslash$item Classify with supervised method$\backslash$end\{itemize\}$\backslash$item Example:$\backslash$begin\{itemize\}$\backslash$item SIFT$\backslash$begin\{itemize\}$\backslash$item apply oriented edges to some region$\backslash$item determines dominant orientation$\backslash$item agregate different regions$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Feature extraction$\backslash$begin\{itemize\}$\backslash$item Gabor wavelets$\backslash$item SIFT$\backslash$item HoG$\backslash$item statistics of input data on natural images creates gabor-like filters$\backslash$item Random filters$\backslash$item learn the filters with gradient descent$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Method$\backslash$begin\{itemize\}$\backslash$item Layers$\backslash$begin\{itemize\}$\backslash$item Filter Bank Layer 
\$F\_\{CSG\}\\backslash$begin\{itemize\}$\backslash$item Convolution filter$\backslash$item Sigmoid/tanh non-linearity$\backslash$item Gain coefficients$\backslash$end\{itemize\}$\backslash$item Rectification Layer \$R\_\{abs\}\\backslash$item Local Contrast Normalization Layer \$N\\backslash$item Average Pooling and Subscampling Layer \$P\_A\\backslash$item Max Pooling and Subscampling Layer \$P\_M\\backslash$end\{itemize\}$\backslash$item Architectures$\backslash$begin\{itemize\}$\backslash$item \$F\_\{CSG\}-P\_A\\backslash$item \$F\_\{CSG\}-R\_\{abs\}-P\_A\\backslash$item \$F\_\{CSG\}-R\_\{abs\}-N-P\_A\\backslash$item \$F\_\{CSG\}-N\\backslash$end\{itemize\}$\backslash$item Training protocols$\backslash$begin\{itemize\}$\backslash$item Random Features and Supervised Classifier - R and RR$\backslash$item Unsupervised Features, Supervised Classifier - U and UU$\backslash$item Random Features, Global Supervised Refinement - R+ and R+R+$\backslash$item Unsupervised Feature, Global Supervised Refinement - U+ and U+U+$\backslash$end\{itemize\}$\backslash$item Generation of Unsupervised filters using Predictive Sparse Decomposition$\backslash$end\{itemize\}$\backslash$item Results$\backslash$begin\{itemize\}$\backslash$item Random filters achieve good performance$\backslash$item Supervised Refinement improves$\backslash$item Two stages are better than one$\backslash$item Unsupervised pretraining achieves better results, but in case of using rectification and normalization the improvement is about \$1\backslash\%\\backslash$item Rectification is very important$\backslash$item One stage + PMK SVM gives good results$\backslash$item Using handmade Gabbor filters goot worst results than random filters$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Jarrett, Kevin and Kavukcuoglu, Koray and Ranzato, Marc' Aurelio and LeCun, Yann}, doi = {10.1109/ICCV.2009.5459469}, file = {:share/imagedb/perellm1/references/Jarrett et al.\_2009\_What is the best multi-stage 
architecture for object recognition.pdf:pdf}, isbn = {978-1-4244-4420-5}, journal = {2009 IEEE 12th International Conference on Computer Vision}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = sep, pages = {2146--2153}, publisher = {Ieee}, title = {{What is the best multi-stage architecture for object recognition?}}, url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=5459469}, year = {2009} }  @article{Lowe2004, author = {Lowe, David G.}, doi = {10.1023/B:VISI.0000029664.99615.94}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Lowe - 2004 - Distinctive Image Features from Scale-Invariant Keypoints.pdf:pdf}, issn = {0920-5691}, journal = {International Journal of Computer Vision}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = nov, number = {2}, pages = {91--110}, title = {{Distinctive Image Features from Scale-Invariant Keypoints}}, url = {http://link.springer.com/10.1023/B:VISI.0000029664.99615.94}, volume = {60}, year = {2004} }  @book{Mehrotra1997, author = {Mehrotra, Kishan and Mohan, CK and Ranka, Sanjay}, file = {:share/imagedb/perellm1/references/Mehrotra, Mohan, Ranka\_1997\_Elements of artificial neural networks.pdf:pdf}, isbn = {0-262-13328-8}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Elements of artificial neural networks}}, url = {http://books.google.com/books?hl=en\&lr=\&id=6d68Y4Wq\_R4C\&oi=fnd\&pg=PA1\&dq=Elements+of+Artificial+Neural+Networks\&ots=6ry8W1CZC4\&sig=sJvDyqo5xQAE0sqdRijEki0nOrM}, year = {1997} }  @article{Mnih2014, abstract = {Applying convolutional neural networks to large images is computationally expensive because the amount of computation scales linearly with the number of image pixels. We present a novel recurrent neural network model that is capable of extracting information from an image or video by adaptively selecting a sequence of regions or locations and only processing the selected regions at high resolution. 
Like convolutional neural networks, the proposed model has a degree of translation invariance built-in, but the amount of computation it performs can be controlled independently of the input image size. While the model is non-differentiable, it can be trained using reinforcement learning methods to learn task-specific policies. We evaluate our model on several image classification tasks, where it significantly outperforms a convolutional neural network baseline on cluttered images, and on a dynamic visual control problem, where it learns to track a simple object without an explicit training signal for doing so.}, author = {Mnih, Volodymyr and Heess, Nicolas and Graves, Alex}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/RAAIND6A/Mnih et al. - 2014 - Recurrent Models of Visual Attention.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/URNH76JN/1406.html:html}, journal = {Advances in Neural Information Processing Systems}, keywords = {Computer Science - Computer Vision and Pattern Rec,Computer Science - Learning,Statistics - Machine Learning,mscthesis}, mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,Computer Science - Learning,Statistics - Machine Learning,mscthesis}, month = jun, title = {{Recurrent Models of Visual Attention}}, url = {http://arxiv.org/abs/1406.6247 http://www.arxiv.org/pdf/1406.6247.pdf http://papers.nips.cc/paper/5542-recurrent-models-of-visual-attention}, year = {2014} }  @article{Phung2005, abstract = {This work presents a study of three important issues of the color pixel classification approach to skin segmentation: color representation, color quantization, and classification algorithm. Our analysis of several representative color spaces using the Bayesian classifier with the histogram technique shows that skin segmentation based on color pixel classification is largely unaffected by the choice of the color space. 
However, segmentation performance degrades when only chrominance channels are used in classification. Furthermore, we find that color quantization can be as low as 64 bins per channel, although higher histogram sizes give better segmentation performance. The Bayesian classifier with the histogram technique and the multilayer perceptron classifier are found to perform better compared to other tested classifiers, including three piecewise linear classifiers, three unimodal Gaussian classifiers, and a Gaussian mixture classifier.}, author = {Phung, S.L. and Bouzerdoum, A. and Chai, Sr. D.}, doi = {10.1109/TPAMI.2005.17}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/VT26VD4P/Phung et al. - 2005 - Skin segmentation using color pixel classification.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/DS8RERSH/abs\_all.html:html}, issn = {0162-8828}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, keywords = {Algorithms,Bayes methods,Bayesian classifier,Bayesian methods,Classification algorithms,Cluster Analysis,Color,Colorimetry,Computer Graphics,Computer Simulation,Degradation,Gaussian mixture classifier,Gaussian processes,Histograms,Image Enhancement,Image Interpretation- Computer-Assisted,Index Terms- Pixel classification,Information Storage and Retrieval,Models- Biological,Models- Statistical,Neural Networks (Computer),Pattern Recognition- Automated,Quantization,Reproducibility of Results,Sensitivity and Specificity,Signal Processing- Computer-Assisted,Skin,Skin Pigmentation,Subtraction Technique,Testing,artificial intelligence,chrominance channels,classifier design and evaluation,color pixel classification algorithm,color quantization,color space,color space representation,face detection.,histogram technique,image classification,image colour analysis,image representation,image segmentation,mscthesis,multilayer perceptron classifier,multilayer perceptrons,performance 
degradation,performance evaluation,piecewise linear classifiers,skin segmentation,unimodal Gaussian classifiers}, mendeley-tags = {Algorithms,Bayes methods,Bayesian classifier,Bayesian methods,Classification algorithms,Cluster Analysis,Color,Colorimetry,Computer Graphics,Computer Simulation,Degradation,Gaussian mixture classifier,Gaussian processes,Histograms,Image Enhancement,Image Interpretation- Computer-Assisted,Index Terms- Pixel classification,Information Storage and Retrieval,Models- Biological,Models- Statistical,Neural Networks (Computer),Pattern Recognition- Automated,Quantization,Reproducibility of Results,Sensitivity and Specificity,Signal Processing- Computer-Assisted,Skin,Skin Pigmentation,Subtraction Technique,Testing,artificial intelligence,chrominance channels,classifier design and evaluation,color pixel classification algorithm,color quantization,color space,color space representation,face detection.,histogram technique,image classification,image colour analysis,image representation,image segmentation,mscthesis,multilayer perceptron classifier,multilayer perceptrons,performance degradation,performance evaluation,piecewise linear classifiers,skin segmentation,unimodal Gaussian classifiers}, month = jan, number = {1}, pages = {148--154}, shorttitle = {Skin segmentation using color pixel classification}, title = {{Skin segmentation using color pixel classification: analysis and comparison}}, url = {http://ieeexplore.ieee.org/ielx5/34/29817/01359760.pdf?tp=\&arnumber=1359760\&isnumber=29817 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1359760}, volume = {27}, year = {2005} }  @article{Pollack1989, abstract = {Research efforts to study computation and cognitive modeling on neurally-inspired mechanisms have come to be called Connectionism. Rather than being brand new, it is actually the rebirth of a research programme which thrived from the 40s through the 60s and then was severely retrenched in the 70s. 
Connectionism is often posed as a paradigmatic competitor to the Symbolic Processing tradition of Artificial Intelligence (Dreyfus \& Dreyfus, 1988), and, indeed, the counterpoint in the timing of their intellectual and commercial fortunes may lead one to believe that research in cognition is merely a zero-sum game. This paper surveys the history of the field, often in relation to AI, discusses its current successes and failures, and makes some predictions for where it might lead in the future.}, author = {Pollack, JB}, file = {:share/imagedb/perellm1/references/Pollack\_1989\_Connectionism Past, present, and future.pdf:pdf}, journal = {Artificial Intelligence Review}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--14}, title = {{Connectionism: Past, present, and future}}, url = {http://link.springer.com/article/10.1007/BF00139193}, year = {1989} }  @article{Safadi2013, abstract = {The Quaero group is a consortium of French and German organizations working on Multimedia Indexing and Retrieval. LIG, INRIA and KIT participated to the semantic indexing task and LIG participated to the organization of this task. This paper describes these participations. For the semantic indexing task, our approach uses a six-stages processing pipelines for computing scores for the likelihood of a video shot to contain a target concept. These scores are then used for producing a ranked list of images or shots that are the most likely to contain the target concept. The pipeline is composed of the following steps: descriptor extraction, descriptor optimization, classification, fusion of descriptor variants, higher-level fusion, and re-ranking. We used a number of different descriptors and a hierarchical fusion strategy. We also used conceptual feedback by adding a vector of classification score to the pool of descriptors. The best Quaero run has a Mean Inferred Average Precision of 0.2692, which ranked us 3rd out of 16 participants. 
We also organized the TRECVid SIN 2012 collaborative annotation.}, author = {Safadi, Bahjat and Derbas, Nadia and Hamadi, Abdelkader and Vuong, Thi-thu-thuy and Dong, Han and Mulhem, Philippe and Qu, Georges}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Safadi et al. - 2013 - Quaero at TRECVid 2013 Semantic Indexing.pdf:pdf}, keywords = {mscthesis,trecvid}, mendeley-tags = {mscthesis,trecvid}, title = {{Quaero at TRECVid 2013: Semantic Indexing}}, url = {http://hal.archives-ouvertes.fr/docs/00/77/02/40/PDF/Safadi-al\_TRECVID2012.pdf}, year = {2013} }  @article{Wilcoxon1945, abstract = {The comparison of two treatments generally falls into one of the following two categories: (a) we may have a number of replications for each of the two treatments, which are unpaired, or (b) we may have a number of paired comparisons leading to a series of differences, some of which may be positive and some negative. The appropriate methods for testing the significance of the differences of the means in these two cases are described in most of the textbooks on statistical methods.}, author = {Wilcoxon, Frank}, doi = {10.2307/3001968}, editor = {Kotz, Samuel and Johnson, Norman L.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/SG7JQEGJ/978-1-4612-4380-9\_16.html:html}, isbn = {978-0-387-94039-7, 978-1-4612-4380-9}, issn = {0099-4987}, journal = {Biometrics Bulletin}, keywords = {,Statistics- general}, language = {en}, mendeley-tags = {Statistics- general}, month = dec, number = {6}, pages = {80--83}, publisher = {Springer New York}, series = {Springer Series in Statistics}, title = {{Individual comparisons by ranking methods}}, url = {http://www.jstor.org/stable/3001968 http://link.springer.com/chapter/10.1007/978-1-4612-4380-9\_16}, volume = {1}, year = {1945} }  @incollection{Yosinski2014, abstract = {Many deep neural networks trained on natural images exhibit a curious phenomenon in common: on the first 
layer they learn features similar to Gabor filters and color blobs. Such first-layer features appear not to be specific to a particular dataset or task, but general in that they are applicable to many datasets and tasks. Features must eventually transition from general to specific by the last layer of the network, but this transition has not been studied extensively. In this paper we experimentally quantify the generality versus specificity of neurons in each layer of a deep convolutional neural network and report a few surprising results. Transferability is negatively affected by two distinct issues: (1) the specialization of higher layer neurons to their original task at the expense of performance on the target task, which was expected, and (2) optimization difficulties related to splitting networks between co-adapted neurons, which was not expected. In an example network trained on ImageNet, we demonstrate that either of these two issues may dominate, depending on whether features are transferred from the bottom, middle, or top of the network. We also document that the transferability of features decreases as the distance between the base task and target task increases, but that transferring features even from distant tasks can be better than using random features. A final surprising result is that initializing a network with transferred features from almost any number of layers can produce a boost to generalization that lingers even after fine-tuning to the target dataset.}, author = {Yosinski, Jason and Clune, Jeff and Bengio, Yoshua and Lipson, Hod}, editor = {Ghahramani, Z. and Welling, M. and Cortes, C. and Lawrence, N. D. and Weinberger, K. Q.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/E37U2Z4U/Yosinski et al. 
- 2014 - How transferable are features in deep neural netwo.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/F64VH8SJ/5347-how-transferable-are-features-in-deep-neural-networks.html:html}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {3320--3328}, publisher = {Curran Associates, Inc.}, title = {{How transferable are features in deep neural networks?}}, url = {http://papers.nips.cc/paper/5347-how-transferable-are-features-in-deep-neural-networks.pdf http://papers.nips.cc/paper/5347-how-transferable-are-features-in-deep-neural-networks}, year = {2014} }  @article{Agrawal2014, abstract = {In the last two years, convolutional neural networks (CNNs) have achieved an impressive suite of results on standard recognition datasets and tasks. CNN-based features seem poised to quickly replace engineered representations, such as SIFT and HOG. However, compared to SIFT and HOG, we understand much less about the nature of the features learned by large CNNs. 
In this paper, we experimentally probe several aspects of CNN feature learning in an attempt to help practitioners gain useful, evidence-backed intuitions about how to apply CNNs to computer vision problems.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Analysis of CNN (Alexnet)$\backslash$item Findings$\backslash$begin\{itemize\}$\backslash$item Effects of fine-tuning and pre-training:$\backslash$begin\{itemize\}$\backslash$item Supervised pre-training is beneficial$\backslash$item Fine-tuning seems more significant for fc6 and fc7$\backslash$end\{itemize\}$\backslash$item ImageNet Pre-training does not Overfit:$\backslash$begin\{itemize\}$\backslash$item pre-training time increases performance, and seems to not increase generalization error$\backslash$item For generalization quite quick 15k - 50k iterations (80$\backslash$\%-90$\backslash$\% of final performance)$\backslash$end\{itemize\}$\backslash$item Grandmother cells and distributed codes:$\backslash$begin\{itemize\}$\backslash$item there are some ``grandmother cells'' for bicycle, person, cars and cats (from 15 to 30 filters)$\backslash$item but most of the features are distributed (from 30 to 40 filters)$\backslash$end\{itemize\}$\backslash$item Importance of feature location and magnitude:$\backslash$begin\{itemize\}$\backslash$item CNN encoding:$\backslash$begin\{itemize\}$\backslash$item Filters with non-zero response$\backslash$item Magnitude of the response$\backslash$item Spatial layout$\backslash$end\{itemize\}$\backslash$item spatial location critical for detection, but not for classification$\backslash$item Binarization gives similar results on fc6 and fc7 but not in early conv layers$\backslash$item Losing spatial information drops performance on detection$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Datasets$\backslash$begin\{itemize\}$\backslash$item PASCAL VOC 2007$\backslash$item SUN dataset$\backslash$item ImageNet 
(pretraining)$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Agrawal, Pulkit and Girshick, Ross and Malik, Jitendra}, editor = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/XGAKN7ZE/Agrawal et al. - 2014 - Analyzing the Performance of Multilayer Neural Net.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Z69RS97B/978-3-319-10584-0\_22.html:html}, isbn = {978-3-319-10583-3, 978-3-319-10584-0}, journal = {Computer Vision-ECCV 2014}, keywords = {Artificial Intelligence (incl. Robotics),Computer Graphics,Image Processing and Computer Vision,Pattern Recognition,convolutional neural networks,empirical analysis,mscthesis,object recognition}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Computer Graphics,Image Processing and Computer Vision,Pattern Recognition,convolutional neural networks,empirical analysis,mscthesis,object recognition}, month = jan, pages = {329--344}, publisher = {Springer International Publishing}, series = {Lecture Notes in Computer Science}, title = {{Analyzing the performance of multilayer neural networks for object recognition}}, url = {http://link.springer.com/chapter/10.1007/978-3-319-10584-0\_22 http://link.springer.com/content/pdf/10.1007/978-3-319-10584-0\_22.pdf}, year = {2014} }  @article{Callet2006, abstract = {This paper describes an application of neural networks in the field of objective measurement method designed to automatically assess the perceived quality of digital videos. This challenging issue aims to emulate human judgment and to replace very complex and time consuming subjective quality assessment. Several metrics have been proposed in literature to tackle this issue. They are based on a general framework that combines different stages, each of them addressing complex problems. 
The ambition of this paper is not to present a global perfect quality metric but rather to focus on an original way to use neural networks in such a framework in the context of reduced reference (RR) quality metric. Especially, we point out the interest of such a tool for combining features and pooling them in order to compute quality scores. The proposed approach solves some problems inherent to objective metrics that should predict subjective quality score obtained using the single stimulus continuous quality evaluation (SSCQE) method. This latter has been adopted by video quality expert group (VQEG) in its recently finalized reduced referenced and no reference (RRNR-TV) test plan. The originality of such approach compared to previous attempts to use neural networks for quality assessment, relies on the use of a convolutional neural network (CNN) that allows a continuous time scoring of the video. Objective features are extracted on a frame-by-frame basis on both the reference and the distorted sequences; they are derived from a perceptual-based representation and integrated along the temporal axis using a time-delay neural network (TDNN). Experiments conducted on different MPEG-2 videos, with bit rates ranging 2-6 Mb/s, show the effectiveness of the proposed approach to get a plausible model of temporal pooling from the human vision system (HVS) point of view. 
More specifically, a linear correlation criteria, between objective and subjective scoring, up to 0.92 has been obtained on a set of typical TV videos}, author = {Le Callet, Patrick}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Callet - 2006 - A convolutional neural network approach for objective video quality assessment.pdf:pdf}, journal = {IEEE Transactions on Neural Networks}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, pages = {1316--1327}, title = {{A convolutional neural network approach for objective video quality assessment}}, url = {http://medcontent.metapress.com/index/A65RM03P4874243N.pdf http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1687939 http://hal.univ-nantes.fr/docs/00/28/74/26/PDF/A\_convolutional\_neural\_network\_approach\_for\_objective\_video\_quality\_assessment\_completefinal\_manuscript.pdf}, volume = {5}, year = {2006} }  @article{Lee2009a, abstract = {In recent years, deep learning approaches have gained significant interest as a way of building hierarchical representations from unlabeled data. However, to our knowledge, these deep learning approaches have not been extensively studied for auditory data. In this paper, we apply convolutional deep belief networks to audio data and empirically evaluate them on various audio classification tasks. In the case of speech data, we show that the learned features correspond to phones/phonemes. In addition, our feature representations learned from unlabeled audio data show very good performance for multiple audio classification tasks. We hope that this paper will inspire more research on deep learning approaches applied to a wide range of audio recognition tasks.}, author = {Lee, Honglak and Pham, PT and Largman, Y and Ng, AY}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Lee et al. 
- 2009 - Unsupervised feature learning for audio classification using convolutional deep belief networks.pdf:pdf}, journal = {NIPS}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--9}, title = {{Unsupervised feature learning for audio classification using convolutional deep belief networks.}}, url = {https://papers.nips.cc/paper/3674-unsupervised-feature-learning-for-audio-classification-using-convolutional-deep-belief-networks.pdf}, year = {2009} }  @article{Everingham2014, abstract = {The Pascal Visual Object Classes (VOC) challenge consists of two components: (i) a publicly available dataset of images together with ground truth annotation and standardised evaluation software; and (ii) an annual competition and workshop. There are five challenges: classification, detection, segmentation, action classification, and person layout. In this paper we provide a review of the challenge from 2008–2012. The paper is intended for two audiences: algorithm designers, researchers who want to see what the state of the art is, as measured by performance on the VOC datasets, along with the limitations and weak points of the current generation of algorithms; and, challenge designers, who want to see what we as organisers have learnt from the process and our recommendations for the organisation of future challenges. To analyse the performance of submitted algorithms on the VOC datasets we introduce a number of novel evaluation methods: a bootstrapping method for determining whether differences in the performance of two algorithms are significant or not; a normalised average precision so that performance can be compared across classes with different proportions of positive instances; a clustering method for visualising the performance across multiple algorithms so that the hard and easy images can be identified; and the use of a joint classifier over the submitted algorithms in order to measure their complementarity and combined performance. 
We also analyse the community’s progress through time using the methods of Hoiem et al. (Proceedings of European Conference on Computer Vision, 2012) to identify the types of occurring errors. We conclude the paper with an appraisal of the aspects of the challenge that worked well, and those that could be improved in future challenges.}, author = {Everingham, Mark and Eslami, SMA}, doi = {10.1007/s11263-014-0733-5}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/AW5E2FDN/Everingham et al. - 2014 - The Pascal Visual Object Classes Challenge A Retr.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/B8IZHA92/s11263-014-0733-5.html:html}, issn = {0920-5691, 1573-1405}, journal = {International Journal of Computer Vision}, keywords = {Artificial Intelligence (incl. Robotics),Benchmark,Computer Imaging- Vision- Pattern Recognition and Graphics,Database,Image Processing and Computer Vision,Object detection,Pattern Recognition,object recognition,segmentation}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Benchmark,Computer Imaging- Vision- Pattern Recognition and Graphics,Database,Image Processing and Computer Vision,Object detection,Pattern Recognition,object recognition,segmentation}, month = jun, number = {1}, pages = {98--136}, shorttitle = {The Pascal Visual Object Classes Challenge}, title = {{The Pascal visual object classes challenge: A retrospective}}, url = {http://link.springer.com/article/10.1007/s11263-014-0733-5 http://link.springer.com/content/pdf/10.1007/s11263-014-0733-5.pdf}, volume = {111}, year = {2014} }  @inproceedings{Bruce2000, abstract = {Vision systems employing region segmentation by color are crucial in real-time mobile robot applications. With careful attention to algorithm efficiency, fast color image segmentation can be accomplished using commodity image capture and CPU hardware. 
This paper describes a system capable of tracking several hundred regions of up to 32 colors at 30 Hz on general purpose commodity hardware. The software system consists of: a novel implementation of a threshold classifier, a merging system to form regions through connected components, a separation and sorting system that gathers various region features, and a top down merging heuristic to approximate perceptual grouping. A key to the efficiency of our approach is a new method for accomplishing color space thresholding that enables a pixel to be classified into one or more, up to 32 colors, using only two logical AND operations. The algorithms and representations are described, as well as descriptions of three applications in which it has been used}, author = {Bruce, J. and Balch, T. and Veloso, M.}, doi = {10.1109/IROS.2000.895274}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/KHG6ACZ4/Bruce et al. - 2000 - Fast and inexpensive color image segmentation for .pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/SD66K2PI/abs\_all.html:html}, keywords = {Application software,Color,Hardware,Machine vision,Merging,Real time systems,Software systems,Sorting,color image segmentation,color space thresholding,heuristic,image colour analysis,image segmentation,mobile robot,mobile robots,mscthesis,optical tracking,perceptual grouping,robot vision,threshold classifier,tracking}, mendeley-tags = {Application software,Color,Hardware,Machine vision,Merging,Real time systems,Software systems,Sorting,color image segmentation,color space thresholding,heuristic,image colour analysis,image segmentation,mobile robot,mobile robots,mscthesis,optical tracking,perceptual grouping,robot vision,threshold classifier,tracking}, pages = {2061--2066}, title = {{Fast and inexpensive color image segmentation for interactive robots}}, url = 
{http://ieeexplore.ieee.org/ielx5/7177/19356/00895274.pdf?tp=\&arnumber=895274\&isnumber=19356 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=895274}, volume = {3}, year = {2000} }  @article{Carreira-Perpinan2005, abstract = {Maximum-likelihood(ML) learning of Markov random fields is challenging because it requires estimates of averages that have anexponential number of terms. Markov chain Monte Carlo methods typically take a long time to converge on unbiased estimates, but Hinton (2002) showed that if the Markovchain is only run for a few steps, the learning can still work well and it approximately minimizes a different function called “contrastive divergence”(CD). CD learning has been successfully applied to various types of random fields. Here, we study the properties of CD learning and show that it provides biased estimates in general, but that the bias is typically very small. Fast CD learning can therefore be used to get close to an ML solution and slow ML learning can then be used to fine-tune the CD solution.}, annote = {cited: 193 (01/06/2014)}, author = {Carreira-Perpinan, MA and Hinton, Geoffrey}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Carreira-Perpinan, Hinton - 2005 - On contrastive divergence learning.pdf:pdf}, journal = {\ldots on artificial intelligence and \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{On contrastive divergence learning}}, url = {http://core.kmi.open.ac.uk/download/pdf/22017.pdf\#page=42}, volume = {0}, year = {2005} }  @article{Copeland1996, abstract = {It is not widely realised that Turing was probably the first person to consider building computing machines out of simple, neuron-like elements connected together into networks in a largely random manner. Turing called his networks ‘unorganised machines’. 
By the application of what he described as ‘appropriate interference, mimicking education’ an unorganised machine can be trained to perform any task that a Turing machine can carry out, provided the number of ‘neurons’ is sufficient. Turing proposed simulating both the behaviour of the network and the training process by means of a computer program. We outline Turing's connectionist project of 1948.}, author = {Copeland, BJ and Proudfoot, Diane}, doi = {10.1007/BF00413694}, file = {:share/imagedb/perellm1/references/Copeland, Proudfoot\_1996\_On Alan Turing's anticipation of connectionism.pdf:pdf}, issn = {0039-7857, 1573-0964}, journal = {Synthese}, keywords = {Epistemology,Logic,Metaphysics,Philosophy,Philosophy of Language,mscthesis}, language = {en}, mendeley-tags = {Epistemology,Logic,Metaphysics,Philosophy,Philosophy of Language,mscthesis}, month = sep, number = {3}, pages = {361--377}, title = {{On Alan Turing's anticipation of connectionism}}, url = {http://link.springer.com/article/10.1007/BF00413694}, volume = {108}, year = {1996} }  @book{Fitzgibbon2012, author = {Fitzgibbon, Andrew and Lazebnik, Svetlana and Perona, Pietro and Sato, Yoichi and Schmid, Cordelia}, file = {:share/imagedb/perellm1/references/Fitzgibbon et al.\_2012\_Computer Vision - ECCV 2012.pdf:pdf}, isbn = {9783642337642}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Computer Vision - ECCV 2012}}, year = {2012} }  @article{Gabor1954, abstract = {In the case of band-limited signals, the sampling theorem permits us to replace analytic operations with algebraic operations. We are then able to discuss problems of measurement of information, coding, and transmission over noisy channels in terms of discrete samples, rather than continuous time functions. The design of the optimum linear filter reduces from a very difficult analysis problem involving spec- trum factorizatlon to a straightforward problem of solving a set of simultaneous linear equations. 
Unless we are interested in the most economical implementation, it is not even necessary to solve the equations. since a synthesis procedure involving only simple functions of the correlation functions is available. When extended to the general nonlinear case, the design is still specified by a set of simultaneous algebraic equations, but the labor of solution grows very rapidly. It is proposed to short circuit this labor by building a learning filter which in effect designs itself. A training period in which the adjust- ments are automatically optimized precedes the use period. By modi- fying the training program, it is possible that the filter could be taught to recognize specific signals, including, perhaps, certain speech sounds.}, author = {Gabor, D.}, doi = {10.1109/TCT.1954.1083594}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/N4JUPVCP/Gabor - 1954 - Communication Theory and Cybernetics.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/9B8GK4XI/abs\_all.html:html}, issn = {0197-6389}, journal = {Circuit Theory, Transactions of the IRE Professional \ldots}, keywords = {Acoustic noise,Buildings,Circuit synthesis,Cybernetics,Nonlinear equations,Nonlinear filters,Sampling methods,Signal analysis,Time measurement,mscthesis,speech recognition}, mendeley-tags = {Acoustic noise,Buildings,Circuit synthesis,Cybernetics,Nonlinear equations,Nonlinear filters,Sampling methods,Signal analysis,Time measurement,mscthesis,speech recognition}, month = dec, number = {4}, pages = {19--31}, title = {{Communication theory and cybernetics}}, url = {http://ieeexplore.ieee.org/ielx5/4811122/23424/01083594.pdf?tp=\&arnumber=1083594\&isnumber=23424 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1083594\&tag=1 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1083594}, volume = {CT-1}, year = {1954} }  @article{Hubel1962, abstract = {What chiefly distinguishes cerebral cortex from other parts of the 
central nervous system is the great diversity of its cell types and interconnexions. It would be astonishing if such a structure did not profoundly modify the response patterns of fibres coming into it. In the cat's visual cortex, the receptive field arrangements of single cells suggest that there is indeed a degree of complexity far exceeding anything yet seen at lower levels in the visual system. In a previous paper we described receptive fields of single cortical cells, observing responses to spots of light shone on one or both retinas (Hubel \& Wiesel, 1959). In the present work this method is used to examine receptive fields of a more complex type (Part I) and to make additional observations on binocular interaction (Part II). This approach is necessary in order to understand the behaviour of individual cells, but it fails to deal with the problem of the relationship of one cell to its neighbours. In the past, the technique of recording evoked slow waves has been used with great success in studies of functional anatomy. It was employed by Talbot \& Marshall (1941) and by Thompson, Woolsey \& Talbot (1950) for mapping out the visual cortex in the rabbit, cat, and monkey. Daniel \& Whitteridge (1959) have recently extended this work in the primate. Most of our present knowledge of retinotopic projections, binocular overlap, and the second visual area is based on these investigations. Yet the method of evoked potentials is valuable mainly for detecting behaviour common to large populations of neighbouring cells; it cannot differentiate functionally between areas of cortex smaller than about 1 mm 2. 
To overcome this difficulty a method has in recent years been developed for studying cells separately or in small groups during long micro-electrode penetrations through nervous tissue. Responses are correlated with cell location by reconstructing the electrode tracks from histological material. These techniques have been applied to the somatic sensory cortex of the cat and monkey in a remarkable series of studies by Mountcastle (1957) and Powell$\backslash$\& Mountcastle (1959). Their results show that the approach is a powerful one, capable of revealing systems of organization not hinted at by the known morphology. In Part III of the present paper we use this method in studying the functional architecture of the visual cortex. It helped us attempt to explain on anatomical grounds how cortical receptive fields are built up.}, author = {Hubel, DH and Wiesel, TN}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hubel, Wiesel - 1962 - Receptive fields, binocular interaction and functional architecture in the cat's visual cortex.pdf:pdf}, journal = {The Journal of physiology}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {106--154}, title = {{Receptive fields, binocular interaction and functional architecture in the cat's visual cortex}}, url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1359523/pdf/jphysiol01247-0121.pdf}, year = {1962} }  @article{Kirkpatrick1983, author = {Kirkpatrick, S. and Gelatt, C. D. and Vecchi, M. P.}, file = {:share/imagedb/perellm1/references/Kirkpatrick, Gelatt, Vecchi\_1983\_Optimization by Simulated Annealing.pdf:pdf}, journal = {Science}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, number = {4598}, pages = {671--680}, publisher = {Science, New Series, Vol. 
220}, title = {{Optimization by Simulated Annealing}}, volume = {220}, year = {1983} }  @article{Lee2009, abstract = {There has been much interest in unsupervised learning of hierarchical generative models such as deep belief networks. Scaling such models to full-sized, high-dimensional images remains a difficult problem. To address this problem, we present the convolutional deep belief network, a hierarchical generative model which scales to realistic image sizes. This model is translation-invariant and supports efficient bottom-up and top-down probabilistic inference. Key to our approach is probabilistic max-pooling, a novel technique which shrinks the representations of higher layers in a probabilistically sound way. Our experiments show that the algorithm learns useful high-level visual features, such as object parts, from unlabeled images of objects and natural scenes. We demonstrate excellent performance on several visual recognition tasks and show that our model can perform hierarchical (bottom-up and top-down) inference over full-sized images.}, address = {New York, New York, USA}, annote = {$\backslash$begin\{itemize\}$\backslash$item Probabilistic max-pooling$\backslash$item Scale DBN to real-sized images$\backslash$begin\{itemize\}$\backslash$item Computationally intractable$\backslash$item Need invariance in representation$\backslash$end\{itemize\}$\backslash$item RBM$\backslash$begin\{itemize\}$\backslash$item Binary valued: Independent Bernoulli random variables$\backslash$item Real valued: Gaussian with diagonal covariance$\backslash$item Training:$\backslash$begin\{itemize\}$\backslash$item Stochastic gradient ascent on log-likelihood of training data$\backslash$item Contrastive divergence approximation$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Convolutional RBM$\backslash$begin\{itemize\}$\backslash$item detection layers: convolving feature maps$\backslash$item pooling layers: shrink the 
representation$\backslash$begin\{itemize\}$\backslash$item Block: CxC from bottom layer$\backslash$item Max-pooling : minimizes energy subject to only one unit can be active.$\backslash$end\{itemize\}$\backslash$item Sparsity regularization: hidden units have a mean activation close to a small constant$\backslash$end\{itemize\}$\backslash$item Convolutional Deep belief network$\backslash$begin\{itemize\}$\backslash$item Stacking CRBM on top of one another$\backslash$item Training:$\backslash$begin\{itemize\}$\backslash$item Gibbs sampling$\backslash$item Mean-field (5 iterations in this paper)$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$end\{itemize\} }, author = {Lee, Honglak and Grosse, Roger and Ranganath, Rajesh and Ng, Andrew Y.}, doi = {10.1145/1553374.1553453}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Lee et al. - 2009 - Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations.pdf:pdf}, isbn = {9781605585161}, journal = {Proceedings of the 26th Annual International Conference on Machine Learning - ICML '09}, keywords = {CNN,mscthesis,trecvid}, mendeley-tags = {CNN,mscthesis,trecvid}, pages = {1--8}, publisher = {ACM Press}, title = {{Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations}}, url = {http://portal.acm.org/citation.cfm?doid=1553374.1553453 http://people.csail.mit.edu/rgrosse/icml09-cdbn.pdf}, year = {2009} }  @book{Liu2011, abstract = {Online learning from a signal processing perspectiveThere is increased interest in kernel learning algorithms in neural networks and a growing need for nonlinear adaptive algorithms in advanced signal processing, communications, and controls. "Kernel Adaptive Filtering" is the first book to present a comprehensive, unifying introduction to online learning algorithms in reproducing kernel Hilbert spaces. 
Based on research being conducted in the Computational Neuro-Engineering Laboratory at the University of Florida and in the Cognitive Systems Laboratory at McMaster University, Ontario, Canada, this unique resource elevates the adaptive filtering theory to a new level, presenting a new design methodology of nonlinear adaptive filters.Covers the kernel least mean squares algorithm, kernel affine projection algorithms, the kernel recursive least squares algorithm, the theory of Gaussian process regression, and the extended kernel recursive least squares algorithmPresents a powerful model-selection method called maximum marginal likelihoodAddresses the principal bottleneck of kernel adaptive filters--their growing structureFeatures twelve computer-oriented experiments to reinforce the concepts, with MATLAB codes downloadable from the authors' Web siteConcludes each chapter with a summary of the state of the art and potential future directions for original research"Kernel Adaptive Filtering" is ideal for engineers, computer scientists, and graduate students interested in nonlinear adaptive systems for online applications (applications where the data stream arrives one sample at a time and incremental optimal solutions are desirable). It is also a useful guide for those who look for nonlinear adaptive filtering methodologies to solve practical problems.}, author = {Liu, Weifeng and Principe, Jos\'{e} C. 
and Haykin, Simon}, file = {:share/imagedb/perellm1/references/Liu, Principe, Haykin\_2011\_Kernel Adaptive Filtering A Comprehensive Introduction.pdf:pdf}, isbn = {9781118211212}, keywords = {Science / Waves \& Wave Mechanics,Technology \& Engineering / Electrical,mscthesis}, language = {en}, mendeley-tags = {Science / Waves \& Wave Mechanics,Technology \& Engineering / Electrical,mscthesis}, month = sep, pages = {210}, publisher = {John Wiley \& Sons}, shorttitle = {Kernel Adaptive Filtering}, title = {{Kernel Adaptive Filtering: A Comprehensive Introduction}}, url = {http://books.google.fi/books?id=eWUwB\_P5pW0C}, year = {2011} }  @article{Masci2011, abstract = {We present a novel convolutional auto-encoder (CAE) for unsupervised feature learning. A stack of CAEs forms a convolutional neural network (CNN). Each CAE is trained using conventional on-line gradient descent without additional regularization terms. A max-pooling layer is essential to learn biologically plausible features consistent with those found by previous approaches. Initializing a CNN with filters of a trained CAE stack yields superior performance on a digit (MNIST) and an object recognition (CIFAR10) benchmark.}, author = {Masci, Jonathan and Meier, Ueli and Ciresan, D and Schmidhuber, J}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Masci et al. 
- 2011 - Stacked convolutional auto-encoders for hierarchical feature extraction.pdf:pdf}, journal = {Artificial Neural Networks \ldots}, keywords = {CNN,auto-encoder,classification,convolutional neural network,learning,mscthesis,trecvid,unsupervised}, mendeley-tags = {CNN,mscthesis,trecvid}, pages = {52--59}, title = {{Stacked convolutional auto-encoders for hierarchical feature extraction}}, url = {http://link.springer.com/chapter/10.1007/978-3-642-21735-7\_7 http://www.idsia.ch/~masci/papers/2011\_icann.pdf}, year = {2011} }  @phdthesis{Minsky1954, author = {Minsky, ML}, keywords = {mscthesis}, language = {en}, mendeley-tags = {mscthesis}, pages = {562}, title = {{Theory of neural-analog reinforcement systems and its application to the brain model problem}}, url = {http://books.google.fi/books?id=R3NHmgEACAAJ http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:Theory+of+Neural-analog+Reinforcement+Systems+and+Its+Application+to+the+Brain-model+Problem\#0}, year = {1954} }  @article{Norouzi2009, abstract = {In this paper we present a method for learning class-specific features for recognition. Recently a greedy layer-wise procedure was proposed to initialize weights of deep belief networks, by viewing each layer as a separate restricted Boltzmann machine (RBM). We develop the convolutional RBM (C-RBM), a variant of the RBM model in which weights are shared to respect the spatial structure of images. This framework learns a set of features that can generate the images of a specific object class. Our feature extraction model is a four layer hierarchy of alternating filtering and maximum subsampling. We learn feature parameters of the first and third layers viewing them as separate C-RBMs. The outputs of our feature extraction hierarchy are then fed as input to a discriminative classifier. 
It is experimentally demonstrated that the extracted features are effective for object detection, using them to obtain performance comparable to the state of the art on handwritten digit recognition and pedestrian detection.}, annote = {$\backslash$begin\{itemize\}$\backslash$item New Convolutional Restricted Boltzmann Machine (C-RBM)$\backslash$item Comparable state-of-the-art on handwritten digit recognition and pedestrian detection$\backslash$item RBM$\backslash$begin\{itemize\}$\backslash$item Probabilistic model$\backslash$item hidden variables independent given observerd data$\backslash$item Not capture explicitly spacial structure of images$\backslash$end\{itemize\}$\backslash$item C-RBM$\backslash$begin\{itemize\}$\backslash$item Include spatial locality and weight sharing$\backslash$item Favors filters with high response on training images$\backslash$item Unsupervised learning using Contrastive Divergence$\backslash$item Layerwise training for stacks of RBMs$\backslash$item Convolutional connections are employed in a generative Markov Random Field architecture$\backslash$item Hidden units divided into K feature maps$\backslash$item Convolution problems$\backslash$begin\{itemize\}$\backslash$item Boundary units are withinb a smaller number of subwindows compared to the interior pixels$\backslash$item middle pixels may contribute to \$K\_\{xy\}\$features$\backslash$item Separation of boundary variables (\$v\^{}b\$) from middle variables (\$v\^{}m\$)$\backslash$item Problems sampling from boundary pixels (not have nough features)$\backslash$item Over completeness because of K-features$\backslash$item Sampling creates images very similar to the original ones$\backslash$item Need of more Gibbs sampling steps$\backslash$item Their solution is to fix hidden bias terms \$c\$during training$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Multilayer C-RBMs$\backslash$begin\{itemize\}$\backslash$item Subsampling takes maximum conditional feature 
probability over non-overlapping subwindows of feature maps$\backslash$item Architecture$\backslash$begin\{itemize\}$\backslash$item discriminative layer (SVM)$\backslash$item max pooling$\backslash$item convolution$\backslash$item max pooling$\backslash$item convolution$\backslash$item input$\backslash$end\{itemize\}$\backslash$item On pedestrians also HOG is used in discriminative layer$\backslash$end\{itemize\}$\backslash$item MNIST dataset$\backslash$begin\{itemize\}$\backslash$item Discriminative layer with RBF kernel$\backslash$item 10 one-vs-rest binary SVMs$\backslash$item 1st layer 15 feature maps$\backslash$item 2nd layer 2x2 non-overlapping subwindos$\backslash$item 3rd layer 15 feature maps$\backslash$item 4th layer$\backslash$end\{itemize\}$\backslash$item Comparison with Large CNN$\backslash$begin\{itemize\}$\backslash$item C-RBM is better when training is small$\backslash$end\{itemize\}$\backslash$item Pedestrian dataset$\backslash$begin\{itemize\}$\backslash$item 1st layer 7x7 15 feature maps$\backslash$item 2nd layer 4x4 subsampling$\backslash$item 3rd layer 15x5x5 30 feature maps$\backslash$item 4th layer 2x2 subsampling$\backslash$item + HOG$\backslash$item Discriminative layer with linear kernel$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Norouzi, Mohammad and Ranjbar, Mani and Mori, Greg}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Norouzi, Ranjbar, Mori - 2009 - Stacks of convolutional restricted Boltzmann machines for shift-invariant feature learning.pdf:pdf}, isbn = {9781424439911}, journal = {Computer Vision and Pattern \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {2735--2742}, title = {{Stacks of convolutional restricted Boltzmann machines for shift-invariant feature learning}}, url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=5206577}, year = {2009} }  @article{Perez2012, abstract = {We present a method for human action recognition based on 
the combination of Histograms of Gradients into orientation tensors. It uses only information from HOG3D: no features or points of interest are extracted. The resulting raw histograms obtained per frame are combined into an orientation tensor, making it a simple, fast to compute and effective global descriptor. The addition of new videos and/or new action categories does not require any recomputation or changes to the previously computed descriptors. Our method reaches 92.01\% of recognition rate with KTH, comparable to the best local approaches. For the Hollywood2 dataset, our recognition rate is lower than local approaches but is fairly competitive, suitable when the dataset is frequently updated or the time response is a major application issue.}, author = {Perez, EA and Mota, VF and Maciel, LM and Sad, Dhiego and Vieira, Marcelo B.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Perez et al. - 2012 - Combining gradient histograms using orientation tensors for human action recognition.pdf:pdf}, journal = {Pattern Recognition (ICPR), 2012 21st International Conference on. IEEE}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Combining gradient histograms using orientation tensors for human action recognition}}, url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=6460909}, year = {2012} }  @article{Saul1996, abstract = {We develop a mean field theory for sigmoid belief networks based on ideas from statistical mechanics. Our mean field theory provides a tractable approximation to the true probability distribution in these networks; it also yields a lower bound on the likelihood of evidence. We demonstrate the utility of this framework on a benchmark problem in statistical pattern recognition---the classification of handwritten digits.}, annote = {Comment: See http://www.jair.org/ for any accompanying files}, author = {Saul, L. K. and Jaakkola, T. and Jordan, M. 
I.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/R2UPEUBV/Saul et al. - 1996 - Mean Field Theory for Sigmoid Belief Networks.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/GG4I7SZ6/9603102.html:html}, journal = {arXiv:cs/9603102}, keywords = {Computer Science - Artificial Intelligence,mscthesis}, mendeley-tags = {Computer Science - Artificial Intelligence,mscthesis}, month = feb, title = {{Mean Field Theory for Sigmoid Belief Networks}}, url = {http://arxiv.org/abs/cs/9603102 http://www.arxiv.org/pdf/cs/9603102.pdf}, year = {1996} }  @article{Snoek2013, abstract = {In this paper we summarize our TRECVID 2013 video retrieval experiments. The MediaMill team participated in four tasks: concept detection, object localization, in- stance search, and event recognition. For all tasks the starting point is our top-performing bag-of-words system of TRECVID 2008-2012, which uses color SIFT descrip- tors, average and difference coded into codebooks with spa- tial pyramids and kernel-based machine learning. New this year are concept detection with deep learning, concept detec- tion without annotations, object localization using selective search, instance search by reranking, and event recognition based on concept vocabularies. Our experiments focus on es- tablishing the video retrieval value of the innovations. 
The 2013 edition of the TRECVID benchmark has again been a fruitful participation for the MediaMill team, resulting in the best result for concept detection, concept detection with- out annotation, object localization, concept pair detection, and visual event recognition with few examples.}, author = {Snoek, CGM and van de Sande, KEA}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Snoek, Sande - 2013 - MediaMill at TRECVID 2013 Searching Concepts, Objects, Instances and Events in Video.pdf:pdf}, journal = {\ldots of TRECVID}, keywords = {mscthesis,trecvid}, mendeley-tags = {mscthesis,trecvid}, title = {{MediaMill at TRECVID 2013: Searching Concepts, Objects, Instances and Events in Video}}, url = {http://staff.science.uva.nl/~smeulder/publi.php?bibtex=SnoekPTRECVID2013}, year = {2013} }  @book{WilliamJames1890, abstract = {vol. 1}, author = {{William James}}, file = {:share/imagedb/perellm1/references/William James\_1890\_The principles of psychology.pdf:pdf}, keywords = {Psychology,mscthesis}, language = {eng}, mendeley-tags = {Psychology,mscthesis}, pages = {716}, publisher = {New York : Henry Holt and company}, title = {{The principles of psychology}}, url = {http://archive.org/details/theprinciplesofp01jameuoft http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:The+principles+of+psychology\#0}, year = {1890} }  @article{Dayan1995, annote = {$\backslash$begin\{itemize\}$\backslash$item Hierarchical self-supervised learning$\backslash$item bottom-up recognition, top-down generation$\backslash$item general generative models have exponential number of possible explanations$\backslash$item Use an approximation of the posterior distribution Q, intead of original P$\backslash$item Kullback-Leibler divergence between Q and P$\backslash$item Maximization step optimizes generative parameters$\backslash$item Expectation step optimizes recognition distribution$\backslash$item Deterministic Helmholtz Machine$\backslash$item 
binary stochastic neurons$\backslash$item Top-down theta parameters implement a generative model$\backslash$item Bottom-up phi parameters implement a recognition (discriminative) model$\backslash$item Real posterior is unlikely to be factorial$\backslash$item Instead of using stochastic gradient ascent authors used wake-sleep algorithm. A mean-field inspired approximation.$\backslash$item wake phase:$\backslash$begin\{itemize\}$\backslash$item Generative weights are turned off$\backslash$item Start from the bottom with a sample d$\backslash$item Propagate the activations in upper layers with stochastic recognition units$\backslash$item Once on top$\backslash$item minimize the Kullback-Leibler divergence between the generative connections and the true underlaying real activations$\backslash$end\{itemize\}$\backslash$item Sleep phase:$\backslash$begin\{itemize\}$\backslash$item recognition weights are turned off$\backslash$item Starting from the top activate with the generative weights the lower layers until reach the input$\backslash$item The model finally generates a random instance$\backslash$item We know the real causes, therefore we can optimize the bottom-up weights of the recognition model$\backslash$end\{itemize\}$\backslash$end\{itemize\} }, author = {Dayan, Peter and Hinton, GE and Neal, RM and Zemel, RS}, file = {:share/imagedb/perellm1/references/Dayan et al.\_1995\_The helmholtz machine.pdf:pdf}, journal = {Neural computation}, number = {5}, pages = {889--904}, title = {{The helmholtz machine}}, url = {http://www.mitpressjournals.org/doi/abs/10.1162/neco.1995.7.5.889}, volume = {7}, year = {1995} }  @article{Everingham2010, abstract = {The Pascal Visual Object Classes (VOC) challenge is a benchmark in visual object category recognition and detection, providing the vision and machine learning communities with a standard dataset of images and annotation, and standard evaluation procedures. 
Organised annually from 2005 to present, the challenge and its associated dataset has become accepted as the benchmark for object detection. This paper describes the dataset and evaluation procedure. We review the state-of-the-art in evaluated methods for both classification and detection, analyse whether the methods are statistically different, what they are learning from the images (e.g. the object or its context), and what the methods find easy or confuse. The paper concludes with lessons learnt in the three year history of the challenge, and proposes directions for future improvement and extension.}, author = {Everingham, Mark and Gool, Luc Van}, doi = {10.1007/s11263-009-0275-4}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/FSK57IEE/Everingham et al. - 2009 - The Pascal Visual Object Classes (VOC) Challenge.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/H2DV8GJU/s11263-009-0275-4.html:html}, issn = {0920-5691, 1573-1405}, journal = {International journal of \ldots}, keywords = {Artificial Intelligence (incl. Robotics),Benchmark,Computer Imaging- Vision- Pattern Recognition and Graphics,Database,Image Processing and Computer Vision,Object detection,Pattern Recognition,object recognition}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. 
Robotics),Benchmark,Computer Imaging- Vision- Pattern Recognition and Graphics,Database,Image Processing and Computer Vision,Object detection,Pattern Recognition,object recognition}, month = sep, number = {2}, pages = {303--338}, title = {{The pascal visual object classes (voc) challenge}}, url = {http://link.springer.com/article/10.1007/s11263-009-0275-4 http://link.springer.com/content/pdf/10.1007/s11263-009-0275-4.pdf}, volume = {88}, year = {2010} }  @article{Shotton2013, abstract = {We propose a new method to quickly and accurately predict human pose---the 3D positions of body joints---from a single depth image, without depending on information from preceding frames. Our approach is strongly rooted in current object recognition strategies. By designing an intermediate representation in terms of body parts, the difficult pose estimation problem is transformed into a simpler per-pixel classification problem, for which efficient machine learning techniques exist. By using computer graphics to synthesize a very large dataset of training image pairs, one can train a classifier that estimates body part labels from test images invariant to pose, body shape, clothing, and other irrelevances. Finally, we generate confidence-scored 3D proposals of several body joints by reprojecting the classification result and finding local modes. The system runs in under 5ms on the Xbox 360. Our evaluation shows high accuracy on both synthetic and real test sets, and investigates the effect of several training parameters. We achieve state-of-the-art accuracy in our comparison with related work and demonstrate improved generalization over exact whole-skeleton nearest neighbor matching.}, author = {Shotton, Jamie and Sharp, Toby and Kipman, Alex}, doi = {10.1145/2398356.2398381}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/37UF7S8A/Shotton et al. 
- 2013 - Real-time Human Pose Recognition in Parts from Sin.pdf:pdf}, issn = {0001-0782}, journal = {Communications of the \ldots}, month = jan, number = {1}, pages = {116--124}, title = {{Real-time human pose recognition in parts from single depth images}}, url = {http://doi.acm.org/10.1145/2398356.2398381 http://dl.acm.org/ft\_gateway.cfm?id=2398381\&type=pdf http://dl.acm.org/citation.cfm?id=2398381}, volume = {56}, year = {2013} }  @article{Yeh2012, abstract = {We present a novel Markov chain Monte Carlo (MCMC) algorithm that generates samples from transdimensional distributions encoding complex constraints. We use factor graphs, a type of graphical model, to encode constraints as factors. Our proposed MCMC method, called locally annealed reversible jump MCMC, exploits knowledge of how dimension changes affect the structure of the factor graph. We employ a sequence of annealed distributions during the sampling process, allowing us to explore the state space across different dimensionalities more freely. This approach is motivated by the application of layout synthesis where relationships between objects are characterized as constraints. In particular, our method addresses the challenge of synthesizing open world layouts where the number of objects are not fixed and optimal configurations for different numbers of objects may be drastically different. We demonstrate the applicability of our approach on two open world layout synthesis problems: coffee shops and golf courses.}, author = {Yeh, Yi-Ting and Yang, Lingfeng and Watson, Matthew and Goodman, Noah D. and Hanrahan, Pat}, doi = {10.1145/2185520.2185552}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/3T2FMMHE/Yeh et al. - 2012 - Synthesizing Open Worlds with Constraints Using Lo.pdf:pdf}, issn = {0730-0301}, journal = {ACM Trans. 
Graph.}, keywords = {constrained synthesis,factor graphs,open worlds}, mendeley-tags = {constrained synthesis,factor graphs,open worlds}, month = jul, number = {4}, pages = {56:1--56:11}, title = {{Synthesizing Open Worlds with Constraints Using Locally Annealed Reversible Jump MCMC}}, url = {http://doi.acm.org/10.1145/2185520.2185552 http://dl.acm.org/ft\_gateway.cfm?id=2185552\&type=pdf}, volume = {31}, year = {2012} }  @article{Zia2013, abstract = {Geometric 3D reasoning at the level of objects has received renewed attention recently in the context of visual scene understanding. The level of geometric detail, however, is typically limited to qualitative representations or coarse boxes. This is linked to the fact that today's object class detectors are tuned toward robust 2D matching rather than accurate 3D geometry, encouraged by bounding-box-based benchmarks such as Pascal VOC. In this paper, we revisit ideas from the early days of computer vision, namely, detailed, 3D geometric object class representations for recognition. These representations can recover geometrically far more accurate object hypotheses than just bounding boxes, including continuous estimates of object pose and 3D wireframes with relative 3D positions of object parts. In combination with robust techniques for shape description and inference, we outperform state-of-the-art results in monocular 3D pose estimation. In a series of experiments, we analyze our approach in detail and demonstrate novel applications enabled by such an object class representation, such as fine-grained categorization of cars and bicycles, according to their 3D geometry, and ultrawide baseline matching.}, author = {Zia, M.Z. and Stark, M. and Schiele, B. and Schindler, K.}, doi = {10.1109/TPAMI.2013.87}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/A348HZMF/Zia et al. 
- 2013 - Detailed 3D Representations for Object Recognition.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/VZHPWQUH/abs\_all.html:html}, issn = {0162-8828}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, keywords = {3D geometric object class representations,3D positions,3D representation,3D wireframes,Algorithms,Computational modeling,Computer Simulation,Computer vision,Design automation,Detectors,Geometry,Image Interpretation- Computer-Assisted,Imaging- Three-Dimensional,Models- Theoretical,Pattern Recognition- Automated,Shape,Solid modeling,Three-dimensional displays,artificial intelligence,bounding-box-based benchmarks,computational geometry,fine-grained categorization,geometric 3D reasoning,geometric detail,image 3D reconstruction,image matching,image reconstruction,image representation,inference,inference mechanisms,monocular 3D pose estimation,object class detectors,object modeling,object pose estimation,object recognition,photography,pose estimation,recognition,robust 2D matching,scene understanding,shape description,single image 3D reconstruction,solid modelling,ultrawide baseline matching,visual scene understanding}, mendeley-tags = {3D geometric object class representations,3D positions,3D representation,3D wireframes,Algorithms,Computational modeling,Computer Simulation,Computer vision,Design automation,Detectors,Geometry,Image Interpretation- Computer-Assisted,Imaging- Three-Dimensional,Models- Theoretical,Pattern Recognition- Automated,Shape,Solid modeling,Three-dimensional displays,artificial intelligence,bounding-box-based benchmarks,computational geometry,fine-grained categorization,geometric 3D reasoning,geometric detail,image 3D reconstruction,image matching,image reconstruction,image representation,inference,inference mechanisms,monocular 3D pose estimation,object class detectors,object modeling,object pose estimation,object recognition,photography,pose estimation,recognition,robust 
2D matching,scene understanding,shape description,single image 3D reconstruction,solid modelling,ultrawide baseline matching,visual scene understanding}, month = nov, number = {11}, pages = {2608--2623}, title = {{Detailed 3D Representations for Object Recognition and Modeling}}, url = {http://ieeexplore.ieee.org/ielx7/34/6601604/06516504.pdf?tp=\&arnumber=6516504\&isnumber=6601604 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=6516504\&tag=1}, volume = {35}, year = {2013} }

@inproceedings{Jiang2012,
  abstract = {Convolutional Neural Networks (CNN) have showed success in achieving translation invariance for many image processing tasks. The success is largely attributed to the use of local filtering and max-pooling in the CNN architecture. In this paper, we propose to apply CNN to speech recognition within the framework of hybrid NN-HMM model. We propose to use local filtering and max-pooling in frequency domain to normalize speaker variance to achieve higher multi-speaker speech recognition performance. In our method, a pair of local filtering layer and max-pooling layer is added at the lowest end of neural network (NN) to normalize spectral variations of speech signals. In our experiments, the proposed CNN architecture is evaluated in a speaker independent speech recognition task using the standard TIMIT data sets. Experimental results show that the proposed CNN method can achieve over 10\% relative error reduction in the core TIMIT test sets when comparing with a regular NN using the same number of hidden layers and weights. Our results also show that the best result of the proposed CNN model is better than previously published results on the same TIMIT test sets that use a pre-trained deep NN model.},
  author = {Abdel-Hamid, Ossama and Mohamed, Abdel-rahman and Jiang, Hui and Penn, Gerald},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  doi = {10.1109/ICASSP.2012.6288864},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Abdel-Hamid, Mohamed - 2012 - Applying convolutional neural networks concepts to hybrid NN-HMM model for speech recognition.pdf:pdf},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {4277--4280},
  title = {{Applying convolutional neural networks concepts to hybrid NN-HMM model for speech recognition}},
  url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=6288864},
  year = {2012}
}

@book{Ashby1960,
  address = {London},
  author = {Ashby, W. Ross},
  edition = {Second},
  file = {:share/imagedb/perellm1/references/Ashby\_1960\_Design for a Brain The Origin of Adaptive Behavior.pdf:pdf},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  publisher = {Chapman \& Hall},
  title = {{Design for a Brain: The Origin of Adaptive Behavior}},
  url = {http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:Design+for+a+Brain:+The+Origin+of+Adaptive+Behavior\#0},
  year = {1960}
}

@article{Bengio2009,
  abstract = {Theoretical results suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g., in vision, language, and other AI-level tasks), one may need deep architectures. Deep architectures are composed of multiple levels of non-linear operations, such as in neural nets with many hidden layers or in complicated propositional formulae re-using many sub-formulae. Searching the parameter space of deep architectures is a difficult task, but learning algorithms such as those for Deep Belief Networks have recently been proposed to tackle this problem with notable success, beating the state-of-the-art in certain areas. This monograph discusses the motivations and principles regarding learning algorithms for deep architectures, in particular those exploiting as building blocks unsupervised learning of single-layer models such as Restricted Boltzmann Machines, used to construct deeper models such as Deep Belief Networks.},
  author = {Bengio, Yoshua},
  doi = {10.1561/2200000006},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Bengio - 2009 - Learning Deep Architectures for AI.pdf:pdf},
  issn = {1935-8237},
  journal = {Foundations and Trends in Machine Learning},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  number = {1},
  pages = {1--127},
  title = {{Learning Deep Architectures for AI}},
  url = {http://dl.acm.org/citation.cfm?id=1658424 https://wiki.eecs.yorku.ca/course\_archive/2012-13/F/6328/\_media/learning-deep-ai.pdf},
  volume = {2},
  year = {2009}
}

@book{Cajal1909,
  abstract = {Translation of Textura del sistema nervioso del hombre y de los vertebrados; Microfilmed for preservation; t. 1. G\'{e}n\'{e}ralit\'{e}s, moelle, ganglions rachidiens, bulbe$\backslash$\& protub\'{e}rance.-- t. 2. Cervelet, cerveau moyen, r\'{e}tine, couche optique, corps stri\'{e}, \'{e}corce c\'{e}r\'{e}brale g\'{e}n\'{e}rale$\backslash$\& r\'{e}gionale, grand sympathique},
  address = {Paris},
  author = {{Ram{\'o}n y Cajal}, Santiago},
  file = {:share/imagedb/perellm1/references/Cajal\_1909\_Histologie du systeme nerveux de l'homme \& des vertebres.pdf:pdf},
  keywords = {Nervous System,mscthesis},
  language = {fre},
  mendeley-tags = {Nervous System,mscthesis},
  pages = {1014},
  publisher = {Maloine},
  title = {{Histologie du syst{\`e}me nerveux de l'homme \& des vert{\'e}br{\'e}s}},
  url = {http://archive.org/details/histologiedusyst01ram http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:Histologie+du+systeme+nerveux+de+l'homme+\&+des+vertebres\#0},
  year = {1909}
}

@article{Chai1999,
  abstract = {This paper addresses our proposed method to automatically segment out a person's face from a given image that consists of a head-and-shoulders view of the person and a complex background scene. The method involves a fast, reliable, and effective algorithm that exploits the spatial distribution characteristics of human skin color. A universal skin-color map is derived and used on the chrominance component of the input image to detect pixels with skin-color appearance. Then, based on the spatial distribution of the detected skin-color pixels and their corresponding luminance values, the algorithm employs a set of novel regularization processes to reinforce regions of skin-color pixels that are more likely to belong to the facial regions and eliminate those that are not. The performance of the face-segmentation algorithm is illustrated by some simulation results carried out on various head-and-shoulders test images. The use of face segmentation for video coding in applications such as videotelephony is then presented. We explain how the face-segmentation results can be used to improve the perceptual quality of a videophone sequence encoded by the H.261-compliant coder},
  author = {Chai, Douglas and Ngan, King N.},
  doi = {10.1109/76.767122},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/X3M3DRXC/Chai and Ngan - 1999 - Face segmentation using skin-color map in videopho.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/TMZQC7ZZ/abs\_all.html:html},
  issn = {1051-8215},
  journal = {IEEE Transactions on Circuits and Systems for Video Technology},
  keywords = {Face detection,Facial animation,H.261-compliant coder,Humans,Image coding,Layout,MPEG 4 Standard,Skin,brightness,chrominance component,complex background scene,face recognition,face-segmentation algorithm,fast algorithm,foreground/background coding,head-and-shoulders view,human skin color,image colour analysis,image segmentation,image sequences,input image,luminance,mscthesis,perceptual quality,pixels,regularization processes,reliable algorithm,simulation results,spatial distribution characteristics,test images,universal skin-color map,video coding,videophone applications,videophone sequence,videotelephony},
  mendeley-tags = {Face detection,Facial animation,H.261-compliant coder,Humans,Image coding,Layout,MPEG 4 Standard,Skin,brightness,chrominance component,complex background scene,face recognition,face-segmentation algorithm,fast algorithm,foreground/background coding,head-and-shoulders view,human skin color,image colour analysis,image segmentation,image sequences,input image,luminance,mscthesis,perceptual quality,pixels,regularization processes,reliable algorithm,simulation results,spatial distribution characteristics,test images,universal skin-color map,video coding,videophone applications,videophone sequence,videotelephony},
  month = jun,
  number = {4},
  pages = {551--564},
  title = {{Face segmentation using skin-color map in videophone applications}},
  url = {http://ieeexplore.ieee.org/ielx5/76/16628/00767122.pdf?tp=\&arnumber=767122\&isnumber=16628 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=767122},
  volume = {9},
  year = {1999}
}
@inproceedings{Cun1988,
  author = {Le Cun, Yann},
  booktitle = {Proceedings of the 1988 Connectionist Models Summer School},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Cun - 1988 - A theoretical framework for back-propagation.pdf:pdf},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {21--28},
  publisher = {Morgan Kaufmann},
  title = {{A theoretical framework for back-propagation}},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.28.5453},
  year = {1988}
}

@inproceedings{Dong1999,
  address = {New York, NY, USA},
  author = {Dong, Guozhu and Li, Jinyan},
  booktitle = {Proceedings of the Fifth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  doi = {10.1145/312129.312191},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/MNUMQ99U/Dong and Li - 1999 - Efficient Mining of Emerging Patterns Discovering.pdf:pdf},
  isbn = {1-58113-143-7},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {43--52},
  publisher = {ACM},
  series = {KDD '99},
  shorttitle = {Efficient Mining of Emerging Patterns},
  title = {{Efficient mining of emerging patterns: Discovering trends and differences}},
  url = {http://doi.acm.org/10.1145/312129.312191 http://dl.acm.org/ft\_gateway.cfm?id=312191\&type=pdf http://dl.acm.org/citation.cfm?id=312191},
  year = {1999}
}

@article{Erhan2010,
  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language data sets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: how does unsupervised pre-training work? Answering this questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that support better generalization from the training data set; the evidence from these results supports a regularization explanation for the effect of pre-training.},
  author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Erhan, Bengio, Courville - 2010 - Why does unsupervised pre-training help deep learning.pdf:pdf},
  journal = {Journal of Machine Learning Research},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {625--660},
  title = {{Why does unsupervised pre-training help deep learning?}},
  url = {http://dl.acm.org/citation.cfm?id=1756025 http://machinelearning.wustl.edu/mlpapers/paper\_files/AISTATS2010\_ErhanCBV10.pdf},
  volume = {11},
  year = {2010}
}

@book{Fischler1987,
  abstract = {This book treats the question of how far we have come in understanding intelligence and in duplicating it mechanically. The major facets of intelligence--reasoning, vision, language and learning are discussed as an approach to contrasting biological intelligence with current computer realizations.},
  author = {Fischler, Martin A. and Firschein, Oscar},
  isbn = {9780201120011},
  keywords = {Computers / General,mscthesis},
  language = {en},
  mendeley-tags = {Computers / General,mscthesis},
  month = jan,
  pages = {364},
  publisher = {Addison-Wesley},
  shorttitle = {Intelligence},
  title = {{Intelligence: The Eye, the Brain, and the Computer}},
  url = {http://books.google.fi/books?id=tFMJAAAAIAAJ},
  year = {1987}
}

@inproceedings{Fu2012,
  abstract = {The rapid development of social video sharing platforms has created a huge demand for automatic video classification and annotation techniques, in particular for videos containing social activities of a group of people (e.g. YouTube video of a wedding reception). Recently, attribute learning has emerged as a promising paradigm for transferring learning to sparsely labelled classes in object or single-object short action classification. In contrast to existing work, this paper for the first time, tackles the problem of attribute learning for understanding group social activities with sparse labels. This problem is more challenging because of the complex multi-object nature of social activities, and the unstructured nature of the activity context. To solve this problem, we (1) contribute an unstructured social activity attribute (USAA) dataset with both visual and audio attributes, (2) introduce the concept of semi-latent attribute space and (3) propose a novel model for learning the latent attributes which alleviate the dependence of existing models on exact and exhaustive manual specification of the attribute-space. We show that our framework is able to exploit latent attributes to outperform contemporary approaches for addressing a variety of realistic multi-media sparse data learning tasks including: multi-task learning, N-shot transfer learning, learning with label noise and importantly zero-shot learning.},
  author = {Fu, Yanwei and Hospedales, Timothy M. and Xiang, Tao and Gong, Shaogang},
  booktitle = {Computer Vision -- ECCV 2012},
  doi = {10.1007/978-3-642-33765-9_38},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Fu et al. - 2012 - Attribute learning for understanding unstructured social activity.pdf:pdf},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {530--543},
  publisher = {Springer},
  title = {{Attribute learning for understanding unstructured social activity}},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-33765-9\_38},
  year = {2012}
}

@article{Fukushima1983,
  abstract = {A recognition with a large-scale network is simulated on a PDP-11/34 minicomputer and is shown to have a great capability for visual pattern recognition. The model consists of nine layers of cells. The authors demonstrate that the model can be trained to recognize handwritten Arabic numerals even with considerable deformations in shape. A learning-with-a-teacher process is used for the reinforcement of the modifiable synapses in the new large-scale model, instead of the learning-without-a-teacher process applied to a previous model. The authors focus on the mechanism for pattern recognition rather than that for self-organization.},
  author = {Fukushima, Kunihiko and Miyake, Sei and Ito, Takayuki},
  doi = {10.1109/TSMC.1983.6313076},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/KZAMAJCM/Fukushima et al. - 1983 - Neocognitron A neural network model for a mechani.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/6JU4D7FV/abs\_all.html:html},
  issn = {0018-9472},
  journal = {IEEE Transactions on Systems, Man, and Cybernetics},
  keywords = {Biological neural networks,Brain modeling,Computational modeling,Digital simulation,PDP-11/34 minicomputer,Pattern Recognition,Shape,Visualization,handwritten Arabic numerals,large-scale network,learning-with-a-teacher process,modifiable synapses,mscthesis,neural nets,neural network model,recognition,reinforcement,training,visual pattern recognition,visual perception},
  mendeley-tags = {Biological neural networks,Brain modeling,Computational modeling,Digital simulation,PDP-11/34 minicomputer,Pattern Recognition,Shape,Visualization,handwritten Arabic numerals,large-scale network,learning-with-a-teacher process,modifiable synapses,mscthesis,neural nets,neural network model,recognition,reinforcement,training,visual pattern recognition,visual perception},
  month = sep,
  number = {5},
  pages = {826--834},
  shorttitle = {Neocognitron},
  title = {{Neocognitron: A neural network model for a mechanism of visual pattern recognition}},
  url = {http://ieeexplore.ieee.org/ielx5/21/6313056/06313076.pdf?tp=\&arnumber=6313076\&isnumber=6313056 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=6313076\&tag=1 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=6313076},
  volume = {SMC-13},
  year = {1983}
}

@book{Ham2000a,
  abstract = {From the Publisher:This exciting new text covers artificial neural networks,but more specifically,neurocomputing. Neurocomputing is concerned with processing information,which involves a learning process within an artificial neural network architecture. This neural architecture responds to inputs according to a defined learning rule and then the trained network can be used to perform certain tasks depending on the application. Neurocomputing can play an important role in solving certain problems such as pattern recognition,optimization,event classification,control and identification of nonlinear systems,and statistical analysis. "Principles of Neurocomputing for Science and Engineering," unlike other neural networks texts,is written specifically for scientists and engineers who want to apply neural networks to solve complex problems. For each neurocomputing concept,a solid mathematical foundation is presented along with illustrative examples to accompany that particular architecture and associated training algorithm. The book is primarily intended for graduate-level neural networks courses,but in some instances may be used at the undergraduate level. The book includes many detailed examples and an extensive set of end-of-chapter problems.},
  author = {Ham, Fredric M. and Kostanic, Ivica},
  edition = {First},
  isbn = {0070259666},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  publisher = {McGraw-Hill Higher Education},
  title = {{Principles of Neurocomputing for Science and Engineering}},
  year = {2000}
}

@book{Haykin1994,
  author = {Haykin, Simon},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Haykin - 1994 - Neural networks a comprehensive foundation.pdf:pdf},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  publisher = {Macmillan},
  title = {{Neural networks: a comprehensive foundation}},
  url = {http://dl.acm.org/citation.cfm?id=541500},
  year = {1994}
}

@book{Hecht-Nielsen1989,
  abstract = {Exploring many aspects of neurocomputers, this book gives an overview of the network theory behind them, including a background review, basic concepts, associative networks, mapping networks, spatiotemporal networks, and adaptive resonance networks.},
  author = {Hecht-Nielsen, Robert},
  keywords = {Computers / General,mscthesis},
  language = {en},
  mendeley-tags = {Computers / General,mscthesis},
  month = jan,
  pages = {456},
  publisher = {Addison-Wesley Publishing Company},
  title = {{Neurocomputing}},
  url = {http://books.google.fi/books?id=6YRQAAAAMAAJ http://books.google.com/books?hl=en\&lr=\&id=u6j6HTS-rVQC\&oi=fnd\&pg=PR13\&dq=Neurocomputing\&ots=P7aX3v1AMs\&sig=rdTjIaMsjvryBZNGiv-LSPoO8UA},
  year = {1989}
}

@article{Hinton2012a,
  abstract = {Most current speech recognition systems use hidden Markov models (HMMs) to deal with the temporal variability of speech and Gaussian mixture models (GMMs) to determine how well each state of each HMM fits a frame or a short window of frames of coefficients that represents the acoustic input. An alternative way to evaluate the fit is to use a feed-forward neural network that takes several frames of coefficients as input and produces posterior probabilities over HMM states as output. Deep neural networks (DNNs) that have many hidden layers and are trained using new methods have been shown to outperform GMMs on a variety of speech recognition benchmarks, sometimes by a large margin. This article provides an overview of this progress and represents the shared views of four research groups that have had recent successes in using DNNs for acoustic modeling in speech recognition.},
  author = {Hinton, Geoffrey and Deng, Li and Yu, Dong and Dahl, George E. and Mohamed, Abdel-rahman and Jaitly, Navdeep and Senior, Andrew and Vanhoucke, Vincent and Nguyen, Patrick and Sainath, Tara N. and Kingsbury, Brian},
  doi = {10.1109/MSP.2012.2205597},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hinton et al. - 2012 - Deep Neural Networks for Acoustic Modeling in Speech Recognition.pdf:pdf},
  journal = {IEEE Signal Processing Magazine},
  keywords = {CNN,mscthesis},
  mendeley-tags = {CNN,mscthesis},
  month = nov,
  number = {6},
  pages = {82--97},
  title = {{Deep Neural Networks for Acoustic Modeling in Speech Recognition}},
  url = {http://www.cs.toronto.edu/~asamir/papers/SPM\_DNN\_12.pdf},
  volume = {29},
  year = {2012}
}

@article{Jones2012,
  abstract = {Many cells in both the central visual system and other sensory systems exhibit a center surround organization in their receptive field, where the response to a centrally placed stimulus is modified when a surrounding area is also stimulated. This can follow from laterally directed connections in the local circuit at the level of the cell in question but could also involve more complex interactions. In the lateral geniculate nucleus (LGN), the cells relaying the retinal input display a concentric, center surround organization that in part follows from the similar organization characterizing the retinal cells providing their input. However, local thalamic inhibitory interneurons also play a role, and as we examine here, feedback from the visual cortex too. Here, we show in the primate (macaque) that spatially organized cortical feedback provides a clear and differential influence serving to enhance both responses to stimulation within the center of the receptive field and the ability of the nonclassical surround mechanism to attenuate this. In short, both center and surround mechanisms are influenced by the feedback. This dynamically sharpens the spatial focus of the receptive field and introduces nonlinearities from the cortical mechanism into the LGN.},
  author = {Jones, Helen E. and Andolina, Ian M. and Shipp, Stewart D. and Adams, Daniel L. and Cudeiro, Javier and Salt, Thomas E. and Sillito, Adam M.},
  doi = {10.1523/JNEUROSCI.0831-12.2012},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/MWP7KGKJ/Jones et al. - 2012 - Differential Feedback Modulation of Center and Sur.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/PUR3CMKX/15946.html:html},
  issn = {0270-6474, 1529-2401},
  journal = {The Journal of Neuroscience},
  keywords = {mscthesis},
  language = {en},
  mendeley-tags = {mscthesis},
  month = nov,
  number = {45},
  pages = {15946--15951},
  title = {{Differential feedback modulation of center and surround mechanisms in parvocellular cells in the visual thalamus}},
  url = {http://www.jneurosci.org/content/32/45/15946 http://www.jneurosci.org/content/32/45/15946.full.pdf http://www.jneurosci.org/content/32/45/15946.short http://www.ncbi.nlm.nih.gov/pubmed/23136432},
  volume = {32},
  year = {2012}
}

@techreport{Krizhevsky2010,
  abstract = {We describe how to train a two-layer convolutional Deep Belief Network (DBN) on the 1.6 million tiny imagesdataset.When training a convolutional DBN, one must decide what to do with the edge pixels of teh images. Asthe pixels near the edge of an image contribute to the fewest convolutional filter outputs, the model maysee it fit to tailor its few convolutional filters to better model the edge pixels. This is undesirable becaue itusually comes at the expense of a good model for the interior parts of the image. We investigate several waysof dealing with the edge pixels when training a convolutional DBN. Using a combination of locally-connectedconvolutional units and globally-connected units, as well as a few tricks to reduce the effects of overfitting,we achieve state-of-the-art performance in the classification task of the CIFAR-10 subset of the tiny imagesdataset.},
  author = {Krizhevsky, Alex},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Krizhevsky - 2010 - Convolutional Deep Belief Networks on CIFAR-10.pdf:pdf},
  institution = {University of Toronto},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {1--9},
  title = {{Convolutional Deep Belief Networks on CIFAR-10}},
  url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.222.5826\&rep=rep1\&type=pdf},
  year = {2010}
}

@inproceedings{Le2011b,
  abstract = {Previous work on action recognition has focused on adapting hand-designed local features, such as SIFT or HOG, from static images to the video domain. In this paper, we propose using unsupervised feature learning as a way to learn features directly from video data. More specifically, we present an extension of the Independent Subspace Analysis algorithm to learn invariant spatio-temporal features from unlabeled video data. We discovered that, despite its simplicity, this method performs surprisingly well when combined with deep learning techniques such as stacking and convolution to learn hierarchical representations. By replacing hand-designed features with our learned features, we achieve classification results superior to all previous published results on the Hollywood2, UCF, KTH and YouTube action recognition datasets. On the challenging Hollywood2 and YouTube action datasets we obtain 53.3\% and 75.8\% respectively, which are approximately 5\% better than the current best published results. Further benefits of this method, such as the ease of training and the efficiency of training and prediction, will also be discussed. You can download our code and learned spatio-temporal features here: http://ai.stanford.edu/\~{}wzou/.},
  author = {Le, Quoc V. and Zou, Will Y. and Yeung, Serena Y. and Ng, Andrew Y.},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  doi = {10.1109/CVPR.2011.5995496},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Le et al. - 2011 - Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis.pdf:pdf},
  isbn = {978-1-4577-0394-2},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  month = jun,
  pages = {3361--3368},
  publisher = {IEEE},
  title = {{Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis}},
  url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=5995496},
  year = {2011}
}

@article{LeCun1989a,
  abstract = {The ability of learning networks to generalize can be greatly enhanced by providing constraints from the task domain. This paper demonstrates how such constraints can be integrated into a backpropagation network through the architecture of the network. This approach has been successfully applied to the recognition of handwritten zip code digits provided by the U.S. Postal Service. A single network learns the entire recognition operation, going from the normalized image of the character to the final classification.},
  author = {LeCun, Yann and Boser, Bernhard and Denker, John S. and Henderson, Donnie and Howard, Richard E. and Hubbard, Wayne and Jackel, Lawrence D.},
  file = {:share/imagedb/perellm1/references/LeCun et al.\_1989\_Backpropagation applied to handwritten zip code recognition.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/LeCun et al. - 1989 - Backpropagation applied to handwritten zip code recognition(2).pdf:pdf},
  journal = {Neural Computation},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  number = {4},
  pages = {541--551},
  title = {{Backpropagation applied to handwritten zip code recognition}},
  url = {http://www.mitpressjournals.org/doi/abs/10.1162/neco.1989.1.4.541},
  volume = {1},
  year = {1989}
}

@book{Minsky1969,
  abstract = {Perceptrons: an introduction to computational geometry is a book written by Marvin Minsky and Seymour Papert and published in 1969. An edition with handwritten corrections and additions was released in the early 1970s. An expanded edition was further published in 1987, containing a chapter dedicated to counter the criticisms made of it in the 1980s.},
  address = {Cambridge, MA},
  author = {Minsky, Marvin and Papert, Seymour},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/99U9BZJU/index.html:html},
  keywords = {mscthesis},
  language = {en},
  mendeley-tags = {mscthesis},
  publisher = {MIT Press},
  title = {{Perceptrons: An Introduction to Computational Geometry}},
  url = {http://en.wikipedia.org/w/index.php?title=Perceptrons\_(book)\&oldid=612499916},
  year = {1969}
}

@inproceedings{Nair2010,
  abstract = {Restricted Boltzmann machines were developed using binary stochastic hidden units. These can be generalized by replacing each binary unit by an infinite number of copies that all have the same weights but have progressively more negative biases. The learning and inference rules for these ``Stepped Sigmoid Units'' are unchanged. They can be approximated efficiently by noisy, rectified linear units. Compared with binary units, these units learn features that are better for object recognition on the NORB dataset and face verification on the Labeled Faces in the Wild dataset. Unlike binary units, rectified linear units preserve information about relative intensities as information travels through multiple layers of feature detectors.},
  author = {Nair, Vinod and Hinton, Geoffrey E.},
  booktitle = {Proceedings of the 27th International Conference on Machine Learning (ICML)},
  file = {:share/imagedb/perellm1/references/Nair, Hinton\_2010\_Rectified linear units improve restricted boltzmann machines.pdf:pdf},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {807--814},
  title = {{Rectified linear units improve restricted Boltzmann machines}},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.6419 http://machinelearning.wustl.edu/mlpapers/paper\_files/icml2010\_NairH10.pdf},
  year = {2010}
}

@article{Nathans1986,
  abstract = {The hypothesis that red-green "color blindness" is caused by alterations in the genes encoding red and green visual pigments has been tested and shown to be correct. Genomic DNA's from 25 males with various red-green color vision deficiencies were analyzed by Southern blot hybridization with the cloned red and green pigment genes as probes. The observed genotypes appear to result from unequal recombination or gene conversion (or both). Together with chromosome mapping experiments, these data identify each of the cloned human visual pigment genes.},
  author = {Nathans, Jeremy and Piantanida, Thomas P. and Eddy, Roger L. and Shows, Thomas B. and Hogness, David S.},
  issn = {0036-8075},
  journal = {Science},
  keywords = {Animals,Chromosome Mapping,Chromosomes- Human,Color,Color Perception,Color Vision Defects,DNA,Gene Frequency,Genes,Genetic Variation,Genotype,Humans,Mice,Nucleic Acid Hybridization,Retinal Pigments,X Chromosome},
  language = {eng},
  mendeley-tags = {Animals,Chromosome Mapping,Chromosomes- Human,Color,Color Perception,Color Vision Defects,DNA,Gene Frequency,Genes,Genetic Variation,Genotype,Humans,Mice,Nucleic Acid Hybridization,Retinal Pigments,X Chromosome},
  month = apr,
  number = {4747},
  pages = {203--210},
  title = {{Molecular genetics of inherited variation in human color vision}},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/3485310 http://www.sciencemag.org/content/232/4747/203.short},
  volume = {232},
  year = {1986}
}

@book{Neelakanta1994,
  abstract = {Neural Network Modeling offers a cohesive approach to the statistical mechanics and principles of cybernetics as a basis for neural network modeling. It brings together neurobiologists and the engineers who design intelligent automata to understand the physics of collective behavior pertinent to neural elements and the self-control aspects of neurocybernetics. The theoretical perspectives and explanatory projections portray the most current information in the field, some of which counters certain conventional concepts in the visualization of neuronal interactions.},
  author = {Neelakanta, Perambur S. and DeGroff, Dolores},
  file = {:share/imagedb/perellm1/references/Neelakanta, DeGroff\_1994\_Neural Network Modeling Statistical Mechanics and Cybernetic Perspectives.pdf:pdf},
  isbn = {9780849324888},
  keywords = {Computers / Software Development \& Engineering / S,Technology \& Engineering / Electronics / General,mscthesis},
  language = {en},
  mendeley-tags = {Computers / Software Development \& Engineering / S,Technology \& Engineering / Electronics / General,mscthesis},
  month = jul,
  pages = {260},
  publisher = {CRC Press},
  shorttitle = {Neural Network Modeling},
  title = {{Neural Network Modeling: Statistical Mechanics and Cybernetic Perspectives}},
  url = {http://books.google.fi/books?id=owPGSyUJN4MC},
  year = {1994}
}

@incollection{Neumann1956,
  address = {Princeton, NJ},
  author = {von Neumann, John},
  booktitle = {Automata Studies},
  editor = {Shannon, Claude E. and McCarthy, John},
  file = {:share/imagedb/perellm1/references/Neumann\_1956\_Probabilistic logics and the synthesis of reliable organisms from unreliable components.pdf:pdf;:share/imagedb/perellm1/references/Neumann\_1956\_Probabilistic logics and the synthesis of reliable organisms from unreliable components(2).pdf:pdf},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {43--98},
  publisher = {Princeton University Press},
  series = {Annals of Mathematics Studies},
  title = {{Probabilistic logics and the synthesis of reliable organisms from unreliable components}},
  url = {http://books.google.com/books?hl=en\&lr=\&id=oL57iECEeEwC\&oi=fnd\&pg=PA43\&dq=Probabilistic+logics+and+synthesis+of+reliable+organisms+from+unreliable+components\&ots=xvC0tfMSg4\&sig=Abxpf5T80TGeXCy7LgRxCVhCXGU},
  volume = {34},
  year = {1956}
}

@article{Ngiam2010, abstract = {Convolutional neural networks (CNNs) have been successfully applied to manytasks such as digit and object recognition. Using convolutional (tied) weightssignificantly reduces the number of parameters that have to be learned, and alsoallows translational invariance to be hard-coded into the architecture. In this pa-per, we consider the problem of learning invariances, rather than relying on hard-coding.
We propose tiled convolution neural networks (Tiled CNNs), which usea regular “tiled” pattern of tied weights that does not require that adjacent hiddenunits share identical weights, but instead requires only that hidden units k stepsaway from each other to have tied weights. By pooling over neighboring units,this architecture is able to learn complex invariances (such as scale and rotationalinvariance) beyond translational invariance. Further, it also enjoys much of CNNs’advantage of having a relatively small number of learned parameters (such as easeof learning and greater scalability). We provide an efficient learning algorithm forTiled CNNs based on Topographic ICA, and show that learning complex invariantfeatures allows us to achieve highly competitive results for both the NORB andCIFAR-10 datasets.}, author = {Ngiam, Jiquan and Chen, Zhenghao and Chia, Daniel and Koh, Pan Wei and Ng, Andrew Y.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Ngiam et al. - 2010 - Tiled convolutional neural networks.pdf:pdf}, journal = {Advances in Neural \ldots}, keywords = {CNN,convolutional neural networks,mscthesis}, mendeley-tags = {CNN,convolutional neural networks,mscthesis}, pages = {1--9}, title = {{Tiled convolutional neural networks}}, url = {http://machinelearning.wustl.edu/mlpapers/paper\_files/NIPS2010\_0550.pdf http://papers.nips.cc/paper/4136-tiled-convolutional-neural-networks.pdf}, year = {2010} }  @book{Norberg2005, abstract = {Between 1946 and 1957 computing went from a preliminary, developmental stage to more widespread use accompanied by the beginnings of the digital computer industry. During this crucial decade, spurred by rapid technological advances, the computer enterprise became a major phenomenon. 
In Computers and Commerce, Arthur Norberg explores the importance of these years in the history of computing by focusing on technical developments and business strategies at two important firms, both established in 1946, Engineering Research Associates (ERA) and Eckert-Mauchly Computer Company (EMCC), from their early activities through their acquisition by Remington Rand.Both ERA and EMCC had their roots in World War II, and in postwar years both firms received major funding from the United States government. Norberg analyzes the interaction between the two companies and the government and examines the impact of this institutional context on technological innovation. He assesses the technical contributions of such key company figures as J. Presper Eckert, John Mauchly, Grace Hopper, and William Norris, analyzing the importance of engineering knowledge in converting theoretical designs into workable machines. Norberg looks at the two firms' operations after 1951 as independent subsidiaries of Remington Rand, and documents the management problems that began after Remington Rand merged with Sperry Gyroscope to form Sperry Rand in 1955.}, author = {Norberg, Arthur L.}, file = {:share/imagedb/perellm1/references/Norberg\_2005\_Computers and Commerce A Study of Technology and Management at Eckert-Mauchly Computer Company, Engineering Research Associ.pdf:pdf}, isbn = {9780262140904}, keywords = {Computers / Computer Engineering,Computers / Computer Science,Computers / History,History / Modern / 20th Century,mscthesis}, language = {en}, mendeley-tags = {Computers / Computer Engineering,Computers / Computer Science,Computers / History,History / Modern / 20th Century,mscthesis}, pages = {366}, publisher = {MIT Press}, shorttitle = {Computers and Commerce}, title = {{Computers and Commerce: A Study of Technology and Management at Eckert-Mauchly Computer Company, Engineering Research Associates, and Remington Rand, 1946 -- 1957}}, url = 
{http://books.google.fi/books?id=-f7NIGeIU2EC http://books.google.com/books?hl=en\&lr=\&id=-f7NIGeIU2EC\&oi=fnd\&pg=PR7\&dq=Computers+and+Commerce:+A+Study+of+Technology+and+Management+at+Eckert-Mauchly+Computer+Company,+Engineering+Research+Associates,+and+Remington+Rand,+1946-1957\&ots=\_m\_qwx4emm\&sig=NchylLdoPkcVr8o0K4HJq5EkCrE}, year = {2005} }  @article{Oneata2013, abstract = {Action recognition in uncontrolled video is an important and challenging computer vision problem. Recent progress in this area is due to new local features and models that capture spatio-temporal structure between local features, or human-object interactions. Instead of working towards more complex models, we focus on the low-level features and their encoding. We evaluate the use of Fisher vectors as an alternative to bag-of-word histograms to aggregate a small set of state-of-the-art low-level descriptors, in combination with linear classifiers. We present a large and varied set of evaluations, considering (i) classification of short actions in five datasets, (ii) localization of such actions in feature-length movies, and (iii) large-scale recognition of complex events. We find that for basic action recognition and localization MBH features alone are enough for state-of-the-art performance. For complex events we find that SIFT and MFCC features provide complementary cues. 
On all three problems we obtain state-of-the-art results, while using fewer features and less complex models.}, author = {Oneata, D and Verbeek, Jakob and Schmid, C}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Oneata, Verbeek, Schmid - 2013 - Action and event recognition with Fisher vectors on a compact feature set.pdf:pdf}, journal = {IEEE International Conference on Computer Vision (ICCV)}, keywords = {mscthesis,trecvid}, mendeley-tags = {mscthesis,trecvid}, title = {{Action and event recognition with Fisher vectors on a compact feature set}}, url = {http://hal.archives-ouvertes.fr/docs/00/87/36/62/PDF/action\_and\_event\_recognition\_with\_fisher\_vectors.pdf}, year = {2013} }  @book{Reinhard2010, abstract = {This landmark book is the first to describe HDRI technology in its entirety and covers a wide-range of topics, from capture devices to tone reproduction and image-based lighting. The techniques described enable you to produce images that have a dynamic range much closer to that found in the real world, leading to an unparalleled visual experience. 
As both an introduction to the field and an authoritative technical reference, it is essential to anyone working with images, whether in computer graphics, film, video, photography, or lighting design.New material includes chapters on High Dynamic Range Video Encoding, High Dynamic Range Image Encoding, and High Dynammic Range Display DevicesWritten by the inventors and initial implementors of High Dynamic Range ImagingCovers the basic concepts (including just enough about human vision to explain why HDR images are necessary), image capture, image encoding, file formats, display techniques, tone mapping for lower dynamic range display, and the use of HDR images and calculations in 3D renderingRange and depth of coverage is good for the knowledgeable researcher as well as those who are just starting to learn about High Dynamic Range imaging}, author = {Reinhard, Erik and Heidrich, Wolfgang and Debevec, Paul}, file = {:share/imagedb/perellm1/references/Reinhard, Heidrich, Debevec\_2010\_High dynamic range imaging acquisition, display, and image-based lighting.pdf:pdf}, isbn = {9780080957111}, keywords = {Computers / Computer Graphics,mscthesis}, language = {en}, mendeley-tags = {Computers / Computer Graphics,mscthesis}, month = may, pages = {674}, publisher = {Morgan Kaufmann}, shorttitle = {High Dynamic Range Imaging}, title = {{High dynamic range imaging: acquisition, display, and image-based lighting}}, url = {http://books.google.fi/books?id=w1i\_1kejoYcC http://books.google.com/books?hl=en\&lr=\&id=w1i\_1kejoYcC\&oi=fnd\&pg=PP2\&dq=High+Dynamic+Range+Imaging:+Acquisition,+Display,+and+Image-Based+Lighting\&ots=4iZX1CNJqv\&sig=qtPnOjSJqcdiz1Ys6WRRHB2SmAY}, year = {2010} }  @article{Rigamonti2011, abstract = {Recent years have seen an increasing interest in sparse representations for image classification and object recognition, probably motivated by evidence from the analysis of the primate visual cortex. 
It is still unclear, however, whether or not sparsity helps classification. In this paper we evaluate its impact on the recognition rate using a shallow modular architecture, adopting both standard filter banks and filter banks learned in an unsupervised way. In our experiments on the CIFAR-10 and on the Caltech-101 datasets, enforcing sparsity constraints actually does not improve recognition performance. This has an important practical impact in image descriptor design, as enforcing these constraints can have a heavy computational cost.}, author = {Rigamonti, Roberto and Brown, Matthew A. and Lepetit, Vincent}, doi = {10.1109/CVPR.2011.5995313}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Rigamonti, Brown, Lepetit - 2011 - Are sparse representations really relevant for image classification.pdf:pdf}, isbn = {978-1-4577-0394-2}, journal = {CVPR 2011}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = jun, pages = {1545--1552}, publisher = {IEEE}, title = {{Are sparse representations really relevant for image classification?}}, url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=5995313}, year = {2011} }  @article{Schuster1997, abstract = {In the first part of this paper, a regular recurrent neural network (RNN) is extended to a bidirectional recurrent neural network (BRNN). The BRNN can be trained without the limitation of using input information just up to a preset future frame. This is accomplished by training it simultaneously in positive and negative time direction. Structure and training procedure of the proposed network are explained. In regression and classification experiments on artificial data, the proposed structure gives better results than other approaches. For real data, classification experiments for phonemes from the TIMIT database show the same tendency. 
In the second part of this paper, it is shown how the proposed bidirectional structure can be easily modified to allow efficient estimation of the conditional posterior probability of complete symbol sequences without making any explicit assumption about the shape of the distribution. For this part, experiments on real data are reported}, author = {Schuster, M. and Paliwal, Kuldip K.}, doi = {10.1109/78.650093}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/9WKI68UZ/Schuster and Paliwal - 1997 - Bidirectional recurrent neural networks.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/VD9NATX9/abs\_all.html:html}, issn = {1053-587X}, journal = {IEEE Transactions on Signal Processing}, keywords = {Artificial neural networks,Control systems,Databases,Parameter estimation,Probability,Recurrent neural networks,Shape,TIMIT database,Telecommunication control,Training data,artificial data,bidirectional recurrent neural networks,classification experiments,complete symbol sequences,conditional posterior probability,learning by example,learning from examples,mscthesis,negative time direction,pattern classification,phonemes,positive time direction,real data,recurrent neural nets,regression experiments,regular recurrent neural network,speech processing,speech recognition,statistical analysis,training}, mendeley-tags = {Artificial neural networks,Control systems,Databases,Parameter estimation,Probability,Recurrent neural networks,Shape,TIMIT database,Telecommunication control,Training data,artificial data,bidirectional recurrent neural networks,classification experiments,complete symbol sequences,conditional posterior probability,learning by example,learning from examples,mscthesis,negative time direction,pattern classification,phonemes,positive time direction,real data,recurrent neural nets,regression experiments,regular recurrent neural network,speech processing,speech recognition,statistical 
analysis,training}, month = nov, number = {11}, pages = {2673--2681}, title = {{Bidirectional recurrent neural networks}}, url = {http://ieeexplore.ieee.org/ielx4/78/14188/00650093.pdf?tp=\&arnumber=650093\&isnumber=14188 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=650093\&tag=1}, volume = {45}, year = {1997} }  @book{Seeger2004, abstract = {Gaussian processes (GPs) are natural generalisations of multivariate Gaussian random variables to infinite (countably or continuous) index sets. GPs have been applied in a large number of fields to a diverse range of ends, and very many deep theoretical analyses of various properties are available. This paper gives an introduction to Gaussian processes on a fairly elementary level with special emphasis on characteristics relevant in machine learning. It draws explicit connections to branches such as spline smoothing models and support vector machines in which similar ideas have been investigated. Gaussian process models are routinely used to solve hard machine learning problems. They are attractive because of their flexible non-parametric nature and computational simplicity. Treated within a Bayesian framework, very powerful statistical methods can be implemented which offer valid estimates of uncertainties in our predictions and generic model selection procedures cast as nonlinear optimization problems. Their main drawback of heavy computational scaling has recently been alleviated by the introduction of generic sparse approximations.13,78,31 The mathematical literature on GPs is large and often uses deep concepts which are not required to fully understand most machine learning applications. In this tutorial paper, we aim to present characteristics of GPs relevant to machine learning and to show up precise connections to other "kernel machines" popular in the community. 
Our focus is on a simple presentation, but references to more detailed sources are provided.}, author = {Seeger, Matthias}, booktitle = {International journal of neural systems}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Seeger - 2004 - Gaussian processes for machine learning.pdf:pdf}, isbn = {026218253X}, issn = {0129-0657}, keywords = {Algorithms,Artificial Intelligence,Bayes Theorem,Entropy,Linear Models,Models, Statistical,Normal Distribution,Regression Analysis,Statistics, Nonparametric,mscthesis}, mendeley-tags = {mscthesis}, month = may, number = {2}, pages = {69--106}, pmid = {15112367}, title = {{Gaussian processes for machine learning.}}, url = {http://www.ncbi.nlm.nih.gov/pubmed/15112367}, volume = {14}, year = {2004} }  @article{Springenberg2013, abstract = {We present a probabilistic variant of the recently introduced maxout unit. The success of deep neural networks utilizing maxout can partly be attributed to favorable performance under dropout, when compared to rectified linear units. It however also depends on the fact that each maxout unit performs a pooling operation over a group of linear transformations and is thus partially invariant to changes in its input. Starting from this observation we ask the question: Can the desirable properties of maxout units be preserved while improving their invariance properties ? We argue that our probabilistic maxout (probout) units successfully achieve this balance. 
We quantitatively verify this claim and report classification performance matching or exceeding the current state of the art on three challenging image classification benchmarks (CIFAR-10, CIFAR-100 and SVHN).}, archiveprefix = {arXiv}, arxivid = {1312.6116}, author = {Springenberg, Jost Tobias and Riedmiller, Martin}, eprint = {1312.6116}, file = {:share/imagedb/perellm1/references/Springenberg, Riedmiller\_2013\_Improving Deep Neural Networks with Probabilistic Maxout Units.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = dec, title = {{Improving Deep Neural Networks with Probabilistic Maxout Units}}, url = {http://arxiv.org/abs/1312.6116}, year = {2013} }  @article{Szegedy2013, abstract = {Deep neural networks are highly expressive models that have recently achieved state of the art performance on speech and visual recognition tasks. While their expressiveness is the reason they succeed, it also causes them to learn uninterpretable solutions that could have counter-intuitive properties. In this paper we report two such properties. First, we find that there is no distinction between individual high level units and random linear combinations of high level units, according to various methods of unit analysis. It suggests that it is the space, rather than the individual units, that contains of the semantic information in the high layers of neural networks. Second, we find that deep neural networks learn input-output mappings that are fairly discontinuous to a significant extend. We can cause the network to misclassify an image by applying a certain imperceptible perturbation, which is found by maximizing the network's prediction error. 
In addition, the specific nature of these perturbations is not a random artifact of learning: the same perturbation can cause a different network, that was trained on a different subset of the dataset, to misclassify the same input.}, author = {Szegedy, Christian and Zaremba, W and Sutskever, I}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Szegedy, Zaremba, Sutskever - 2013 - Intriguing properties of neural networks.pdf:pdf}, journal = {arXiv preprint arXiv:1312.6199}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--9}, title = {{Intriguing properties of neural networks}}, url = {http://arxiv.org/abs/1312.6199}, year = {2013} }  @article{Taylor2010, abstract = {We address the problem of learning good features for understanding video data. We introduce a model that learns latent representations of image sequences from pairs of successive images. The convolutional architecture of our model allows it to scale to realistic image sizes whilst using a compact parametrization. In experiments on the NORB dataset, we show our model extracts latent “flow fields” which correspond to the transformation between the pair of input frames. We also use our model to extract low-level motion features in a multi-stage architecture for action recognition, demonstrating competitive performance on both the KTH and Hollywood2 datasets.}, author = {Taylor, GW and Fergus, Rob and LeCun, Y and Bregler, Christoph}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Taylor et al. 
- 2010 - Convolutional learning of spatio-temporal features.pdf:pdf}, journal = {Computer Vision-ECCV 2010}, keywords = {activity recognition,con-,mscthesis,optical flow,restricted boltzmann machines,unsupervised learning,video analysis,volutional nets}, mendeley-tags = {mscthesis}, title = {{Convolutional learning of spatio-temporal features}}, url = {http://link.springer.com/chapter/10.1007/978-3-642-15567-3\_11 http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.178.9267\&rep=rep1\&type=pdf}, year = {2010} }  @article{von1976mechanism, author = {von der Malsburg, Christoph and Willshaw, David J}, journal = {Exp. Brain Res}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {463--469}, title = {{A mechanism for producing continuous neural mappings: ocularity dominance stripes and ordered retino-tectal projections}}, volume = {1}, year = {1976} }  @article{Willshaw1969a, abstract = {The features of a hologram that commend it as a model of associative memory can be improved on by other devices.}, author = {Willshaw, D. J. and Buneman, O. P. and Longuet-Higgins, H. C.}, doi = {10.1038/222960a0}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/AEFI28Q8/Willshaw et al. - 1969 - Non-Holographic Associative Memory.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/HNW83SVA/222960a0.html:html}, journal = {Nature}, keywords = {mscthesis}, language = {en}, mendeley-tags = {mscthesis}, month = jun, number = {5197}, pages = {960--962}, title = {{Non-Holographic Associative Memory}}, url = {http://www.nature.com/nature/journal/v222/n5197/abs/222960a0.html http://www.nature.com/nature/journal/v222/n5197/pdf/222960a0.pdf}, volume = {222}, year = {1969} }  @article{Zeiler2011, abstract = {We present a hierarchical model that learns image decompositions via alternating layers of convolutional sparse coding and max pooling. 
When trained on natural images, the layers of our model capture image information in a variety of forms: low-level edges, mid-level edge junctions, high-level object parts and complete objects. To build our model we rely on a novel inference scheme that ensures each layer reconstructs the input, rather than just the output of the layer directly beneath, as is common with existing hierarchical approaches. This makes it possible to learn multiple layers of representation and we show models with 4 layers, trained on images from the Caltech-101 and 256 datasets. When combined with a standard classifier, features extracted from these models outperform SIFT, as well as representations from other feature learning methods}, author = {Zeiler, Matthew D. and Taylor, Graham W. and Fergus, Rob}, doi = {10.1109/ICCV.2011.6126474}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Zeiler, Taylor, Fergus - 2011 - Adaptive deconvolutional networks for mid and high level feature learning.pdf:pdf}, isbn = {978-1-4577-1102-2}, journal = {2011 International Conference on Computer Vision}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = nov, pages = {2018--2025}, publisher = {Ieee}, title = {{Adaptive deconvolutional networks for mid and high level feature learning}}, url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=6126474}, year = {2011} }  @article{Zhang2014, abstract = {Semantic part localization can facilitate fine-grained categorization by explicitly isolating subtle appearance differences associated with specific object parts. Methods for pose-normalized representations have been proposed, but generally presume bounding box annotations at test time due to the difficulty of object detection. We propose a model for fine-grained categorization that overcomes these limitations by leveraging deep convolutional features computed on bottom-up region proposals. 
Our method learns whole-object and part detectors, enforces learned geometric constraints between them, and predicts a fine-grained category from a pose-normalized representation. Experiments on the Caltech-UCSD bird dataset confirm that our method outperforms state-of-the-art fine-grained categorization methods in an end-to-end evaluation without requiring a bounding box at test time.}, annote = {$\backslash$begin\{itemize\}$\backslash$item fine-grained category detection: detection and classification intra-class (Example: face recognition, dog breeds, and others)$\backslash$item Got state-of-the-art without bounding box at test time$\backslash$item Other approaches use Deformable Parts Model (DPM) plus engineered features (Example: HOG)$\backslash$item Use of R-CNN to localize objects and generalizes to localize parts$\backslash$item Use of Alexnet CNN pretrained with ImageNet and finetuned for detection$\backslash$begin\{itemize\}$\backslash$item Substitute last fc8 1000 to 200$\backslash$item learning rate global = original:10$\backslash$item learning rate of fc8 globalx10$\backslash$item Decrease global by 10 during learning$\backslash$end\{itemize\}$\backslash$item They add learned non-parametric geometric constraints$\backslash$begin\{itemize\}$\backslash$item Mixture of Gaussians with 4 components and \$\backslash alpha = 0.1\\backslash$item K nearest neighbors with \$K = 20\\backslash$item All hyperparameters found by 5 folds cross-validation$\backslash$end\{itemize\}$\backslash$item Use of the fc6 to train the R-CNN object and part detector$\backslash$item Use the pool5 for the geometric constraints$\backslash$item Results$\backslash$begin\{itemize\}$\backslash$item Caltech-UCSD bird with K-nearest Finetuning increases from 68.1$\backslash$\% to 76$\backslash$\%$\backslash$item Without bounding box at test time from 66$\backslash$\% to 73.89$\backslash$\%$\backslash$end\{itemize\}$\backslash$item conclusion$\backslash$begin\{itemize\}$\backslash$item For 
fine-grained discrimination is very useful pose and locality information$\backslash$item Future exploration on automatically discover and model parts as latent variables$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Zhang, Ning and Donahue, Jeff and Girshick, Ross and Darrell, Trevor}, editor = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/T4AZFJMS/Zhang et al. - 2014 - Part-Based R-CNNs for Fine-Grained Category Detect.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/HJEWEUZC/978-3-319-10590-1\_54.html:html}, isbn = {978-3-319-10589-5, 978-3-319-10590-1}, journal = {Computer Vision–ECCV 2014}, keywords = {Artificial Intelligence (incl. Robotics),Computer Graphics,Fine-grained recognition,Image Processing and Computer Vision,Object detection,Pattern Recognition,convolutional models,mscthesis}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Computer Graphics,Fine-grained recognition,Image Processing and Computer Vision,Object detection,Pattern Recognition,convolutional models,mscthesis}, month = jan, pages = {834--849}, publisher = {Springer International Publishing}, series = {Lecture Notes in Computer Science}, title = {{Part-based R-CNNs for fine-grained category detection}}, url = {http://link.springer.com/chapter/10.1007/978-3-319-10590-1\_54 http://link.springer.com/content/pdf/10.1007/978-3-319-10590-1\_54.pdf}, year = {2014} }  @article{Amari1972, abstract = {The dynamic behavior of randomly connected analog neuron-like elements that process pulse-frequency modulated signals is investigated from the macroscopic point of view. By extracting two statistical parameters, the macroscopic state equations are derived in terms of these parameters under some hypotheses on the stochastics of microscopic states. 
It is shown that a random net of statistically symmetric structure is monostable or bistable, and the stability criteria are explicitly given. Random nets consisting of many different classes of elements are also analyzed. Special attention is paid to nets of randomly connected excitatory and inhibitory elements. It is shown that a stable oscillation exists in such a net---in contrast with the fact that no stable oscillations exist in a net of statistically symmetric structure even if negative as well as positive synaptic weights are permitted at a time. The results are checked by computer-simulated experiments.}, author = {Amari, S.-I.}, doi = {10.1109/TSMC.1972.4309193}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/UXGUIVFM/Amari - 1972 - Characteristics of Random Nets of Analog Neuron-Li.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/GSP8IIVC/articleDetails.html:html}, issn = {0018-9472}, journal = {IEEE Transactions on Systems, Man and Cybernetics}, keywords = {Biological neural networks,Equations,Frequency,Instruments,Microscopy,Neurons,Pulse modulation,Signal processing,Stability criteria,Stochastic processes,mscthesis}, mendeley-tags = {Biological neural networks,Equations,Frequency,Instruments,Microscopy,Neurons,Pulse modulation,Signal processing,Stability criteria,Stochastic processes,mscthesis}, month = nov, number = {5}, pages = {643--657}, title = {{Characteristics of Random Nets of Analog Neuron-Like Elements}}, url = {http://ieeexplore.ieee.org/ielx5/21/4309177/04309193.pdf?tp=\&arnumber=4309193\&isnumber=4309177 http://ieeexplore.ieee.org/xpl/articleDetails.jsp?tp=\&arnumber=4309193\&queryText=.QT.Characteristics+of+random+nets+of+analog+neuron-like+elements.}, volume = {SMC-2}, year = {1972} }  @article{Anderson1972, abstract = {A model of a neural system where a group of neurons projects to another group of neurons is discussed. 
We assume that a trace is the simultaneous pattern of individual activities shown by a group of neurons. We assume synaptic interactions add linearly and that synaptic weights (quantitative measure of degree of coupling between two cells) can be coded in a simple but optimal way where changes in synaptic weight are proportional to the product of pre-and postsynaptic activity at a given time. Then it is shown that this simple system is capable of “memory” in the sense that it can (1) recognize a previously presented trace and (2) if two traces have been associated in the past (that is, if trace f̄ was impressed on the first group of neurons and trace ḡ was impressed on the second group of neurons and synaptic weights coupling the two groups changed according to the above rule) presentation of f̄ to the first group of neurons gives rise to f̄ plus a calculable amount of noise at the second set of neurons. This kind of memory is called an “interactive memory” since distinct stored traces interact in storage. It is shown that this model can effectively perform many functions. Quantitative expressions are derived for the average signal to noise ratio for recognition and one type of association. The selectivity of the system is discussed. References to physiological data are made where appropriate. A sketch of a model of mammalian cerebral cortex which generates an interactive memory is presented and briefly discussed. We identify a trace with the activity of groups of cortical pyramidal cells. 
Then it is argued that certain plausible assumptions about the properties of the synapses coupling groups of pyramidal cells lead to the generation of an interactive memory.}, author = {Anderson, JA}, doi = {10.1016/0025-5564(72)90075-2}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/QFDJMQKG/0025556472900752.html:html}, issn = {0025-5564}, journal = {Mathematical Biosciences}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = aug, number = {3-4}, pages = {197--220}, title = {{A simple neural network generating an interactive memory}}, url = {http://www.sciencedirect.com/science/article/pii/0025556472900752}, volume = {14}, year = {1972} }  @article{Qu, author = {Ayache, St\'{e}phane and Qu\'{e}not, Georges and Gensel, J\'{e}r\^{o}me}, file = {:share/imagedb/perellm1/references/Ayache, Qu\'{e}not, Gensel\_2007\_Classifier fusion for SVM-based multimedia semantic indexing.pdf:pdf}, journal = {Springer Berlin Heidelberg}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Classifier fusion for SVM-based multimedia semantic indexing}}, year = {2007} }  @book{Bezdek2013, abstract = {?Optimization has long been a source of both inspiration and applications for geometers, and conversely, discrete and convex geometry have provided the foundations for many optimization techniques, leading to a rich interplay between these subjects. The purpose of the Workshop on Discrete Geometry, the Conference on Discrete Geometry and Optimization, and the Workshop on Optimization, held in September 2011 at the Fields Institute, Toronto, was to further stimulate the interaction between geometers and optimizers. This volume reflects the interplay between these areas. The inspiring Fejes T\'{o}th Lecture Series, delivered by Thomas Hales of the University of Pittsburgh, exemplified this approach. While these fields have recently witnessed a lot of activity and successes, many questions remain open. 
For example, Fields medalist Stephen Smale stated that the question of the existence of a strongly polynomial time algorithm for linear optimization is one of the most important unsolved problems at the beginning of the 21st century. The broad range of topics covered in this volume demonstrates the many recent and fruitful connections between different approaches, and features novel results and state-of-the-art surveys as well as open problems.}, author = {Bezdek, Karoly and Deza, Antoine and Ye, Yinyu}, isbn = {9783319002002}, keywords = {Mathematics / Geometry / General,Mathematics / Optimization,mscthesis}, language = {en}, mendeley-tags = {Mathematics / Geometry / General,Mathematics / Optimization,mscthesis}, month = jul, pages = {341}, publisher = {Springer Science \& Business Media}, title = {{Discrete geometry and optimization}}, url = {http://books.google.fi/books?id=vE7CBAAAQBAJ http://link.springer.com/content/pdf/10.1007/978-3-319-00200-2.pdf}, year = {2013} }  @article{Broomhead1988, abstract = {The relationship between 'learning' in adaptive layered networks and the fitting of data with high dimensional surfaces is discussed. This leads naturally to a picture of 'generalization in terms of interpolation between known data points and suggests a rational approach to the theory of such networks. A class of adaptive networks is identified which makes the interpolation scheme explicit. This class has the property that learning is equivalent to the solution of a set of linear equations. These networks thus represent nonlinear relationships while having a guaranteed learning rule. 
Great Britain.}, author = {Broomhead, DS and Lowe, David}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/P62E5465/oai.html:html;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/JB7KFTKP/Broomhead and Lowe - 1988 - Radial Basis Functions, Multi-Variable Functional .pdf:pdf}, keywords = {*ADAPTIVE SYSTEMS,*INTERPOLATION,*LEARNING,*MULTIVARIATE ANALYSIS,*NETWORKS,Active \& Passive Radar Detection \& Equipment,DATA BASES,FITTINGS,FOREIGN REPORTS,LAYERS,LINEAR DIFFERENTIAL EQUATIONS,NONLINEAR SYSTEMS,SIZES(DIMENSIONS),SOLUTIONS(GENERAL),SURFACES,UNITED KINGDOM,mscthesis}, language = {en}, mendeley-tags = {*ADAPTIVE SYSTEMS,*INTERPOLATION,*LEARNING,*MULTIVARIATE ANALYSIS,*NETWORKS,Active \& Passive Radar Detection \& Equipment,DATA BASES,FITTINGS,FOREIGN REPORTS,LAYERS,LINEAR DIFFERENTIAL EQUATIONS,NONLINEAR SYSTEMS,SIZES(DIMENSIONS),SOLUTIONS(GENERAL),SURFACES,UNITED KINGDOM,mscthesis}, month = mar, title = {{Radial basis functions, multi-variable functional interpolation and adaptive networks}}, url = {http://oai.dtic.mil/oai/oai?verb=getRecord\&metadataPrefix=html\&identifier=ADA196234 http://www.dtic.mil/cgi-bin/GetTRDoc?Location=U2\&doc=GetTRDoc.pdf\&AD=ADA196234}, year = {1988} }  @article{Burt1983, abstract = {We describe a technique for image encoding in which local operators of many scales but identical shape serve as the basis functions. The representation differs from established techniques in that the code elements are localized in spatial frequency as well as in space. Pixel-to-pixel correlations are first removed by subtracting a lowpass filtered copy of the image from the image itself. The result is a net data compression since the difference, or error, image has low variance and entropy, and the low-pass filtered image may represented at reduced sample density. Further data compression is achieved by quantizing the difference image. These steps are then repeated to compress the low-pass image. 
Iteration of the process at appropriately expanded scales generates a pyramid data structure. The encoding process is equivalent to sampling the image with Laplacian operators of many scales. Thus, the code tends to enhance salient image features. A further advantage of the present code is that it is well suited for many image analysis tasks as well as for image compression. Fast algorithms are described for coding and decoding.}, author = {Burt, P.J. and Adelson, E.H.}, doi = {10.1109/TCOM.1983.1095851}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/FDFX5H8M/Burt and Adelson - 1983 - The Laplacian Pyramid as a Compact Image Code.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/M9X9E5DE/abs\_all.html:html}, issn = {0090-6778}, journal = {Communications, IEEE Transactions on}, keywords = {Data compression,Data structures,Entropy,Frequency,Image coding,Image sampling,Laplace equations,Low pass filters,Pixel,Shape}, mendeley-tags = {Data compression,Data structures,Entropy,Frequency,Image coding,Image sampling,Laplace equations,Low pass filters,Pixel,Shape}, month = apr, number = {4}, pages = {532--540}, title = {{The Laplacian pyramid as a compact image code}}, url = {http://ieeexplore.ieee.org/ielx5/26/23970/01095851.pdf?tp=\&arnumber=1095851\&isnumber=23970 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1095851\&tag=1 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1095851}, volume = {31}, year = {1983} }  @article{Churchland1988, abstract = {How is it that we can perceive, learn and be aware of the world? 
The development of new techniques for studying large-scale brain activity, together with insights from computational modeling and a better understanding of cognitive processes, have opened the door for collaborative research that could lead to major advances in our understanding of ourselves.}, author = {Churchland, PS and Sejnowski, TJ}, doi = {10.1126/science.3055294}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/AX5GWUX9/Churchland and Sejnowski - 1988 - Perspectives on cognitive neuroscience.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/5Z4WFP4J/741.html:html}, issn = {0036-8075, 1095-9203}, journal = {Science}, language = {en}, month = nov, number = {4879}, pages = {741--745}, title = {{Perspectives on cognitive neuroscience}}, url = {http://www.sciencemag.org/content/242/4879/741 http://www.ncbi.nlm.nih.gov/pubmed/3055294 http://www.sciencemag.org/content/242/4879/741.full.pdf http://www.sciencemag.org/content/242/4879/741.short}, volume = {242}, year = {1988} }  @article{Ciresan2013, abstract = {We use deep max-pooling convolutional neural networks to detect mitosis in breast histology images. The networks are trained to classify each pixel in the images, using as context a patch centered on the pixel. Simple postprocessing is then applied to the network output. 
Our approach won the ICPR 2012 mitosis detection competition, outperforming other contestants by a significant margin.}, author = {Ciresan, DC and Giusti, Alessandro and Schmidhuber, J\"{u}rgen}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Cireşan, Giusti - 2013 - Mitosis detection in breast cancer histology images with deep neural networks.pdf:pdf}, journal = {Medical Image \ldots}, keywords = {CNN,mscthesis}, mendeley-tags = {CNN,mscthesis}, title = {{Mitosis detection in breast cancer histology images with deep neural networks}}, url = {http://link.springer.com/chapter/10.1007/978-3-642-40763-5\_51 http://www.idsia.ch/~ciresan/data/miccai2013.pdf}, year = {2013} }  @book{Copeland2012, abstract = {The mathematical genius Alan Turing, now well known for his crucial wartime role in breaking the ENIGMA code, was the first to conceive of the fundamental principle of the modern computer-the idea of controlling a computing machine's operations by means of a program of coded instructions, stored in the machine's 'memory'. In 1945 Turing drew up his revolutionary design for an electronic computing machine-his Automatic Computing Engine ('ACE'). A pilot model of the ACE ran its first program in 1950 and the production version, the 'DEUCE', went on to become a cornerstone of the fledgling British computer industry. The first 'personal' computer was based on Turing's ACE. Alan Turing's Automatic Computing Engine describes Turing's struggle to build the modern computer. The first detailed history of Turing's contributions to computer science, this text is essential reading for anyone interested in the history of the computer and the history of mathematics. It contains first hand accounts by Turing and by the pioneers of computing who worked with him. As well as relating the story of the invention of the computer, the book clearly describes the hardware and software of the ACE-including the very first computer programs. 
The book is intended to be accessible to everyone with an interest in computing, and contains numerous diagrams and illustrations as well as original photographs. The book contains chapters describing Turing's path-breaking research in the fields of Artificial Intelligence (AI) and Artificial Life (A-Life). The book has an extensive system of hyperlinks to The Turing Archive for the History of Computing, an on-line library of digital facsimiles of typewritten documents by Turing and the other scientists who pioneered the electronic computer.}, author = {Copeland, BJ}, file = {:share/imagedb/perellm1/references/Copeland\_2012\_Alan Turing's Electronic Brain The Struggle to Build the ACE, the World's Fastest Computer.pdf:pdf}, isbn = {9780199609154}, keywords = {Biography \& Autobiography / Science \& Technology,Computers / Social Aspects / General,Computers / Systems Architecture / General,Mathematics / General,Mathematics / History \& Philosophy,Philosophy / General,Science / History,Science / Philosophy \& Social Aspects,mscthesis}, language = {en}, mendeley-tags = {Biography \& Autobiography / Science \& Technology,Computers / Social Aspects / General,Computers / Systems Architecture / General,Mathematics / General,Mathematics / History \& Philosophy,Philosophy / General,Science / History,Science / Philosophy \& Social Aspects,mscthesis}, month = may, pages = {581}, publisher = {Oxford University Press}, shorttitle = {Alan Turing's Electronic Brain}, title = {{Alan Turing's Electronic Brain: The Struggle to Build the ACE, the World's Fastest Computer}}, url = {http://books.google.fi/books?id=YhQZnczOS7kC http://books.google.com/books?hl=en\&lr=\&id=YhQZnczOS7kC\&oi=fnd\&pg=PP2\&dq=Alan+Turing\%27s+Electronic+Brain:+The+Struggle+to+Build+the+ACE,+the+World\%27s+Fastest+Computer\&ots=IhF85KWtPq\&sig=CtqSqqtZXVZXTLp9\_UIhP4QpPEQ}, year = {2012} }  @article{Cragg1955, author = {Cragg, B. G. and Temperley, H. N. 
V.}, doi = {10.1093/brain/78.2.304}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/XD7BGUIG/304.html:html}, issn = {0006-8950, 1460-2156}, journal = {Brain}, keywords = {mscthesis}, language = {en}, mendeley-tags = {mscthesis}, month = jun, number = {2}, pages = {304--316}, shorttitle = {Memory}, title = {{Memory: The Analogy with Ferromagnetic Hysteresis}}, url = {http://brain.oxfordjournals.org/content/78/2/304 http://www.ncbi.nlm.nih.gov/pubmed/13239912}, volume = {78}, year = {1955} }  @article{Farley1954, abstract = {A general discussion of ideas and definitions relating to self-organizing systems and their synthesis is given, together with remarks concerning their simulation by digital computer. Synthesis and simulation of an actual system is then described. This system, initially randomly organized within wide limits, organizes itself to perform a simple prescribed task.}, author = {Farley, B.G. and Clark, W.}, doi = {10.1109/TIT.1954.1057468}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/AKG73JFP/Farley and Clark - 1954 - Simulation of self-organizing systems by digital c.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/NR38GGM8/abs\_all.html:html}, issn = {2168-2690}, journal = {Transactions of the IRE Professional Group on Information Theory}, keywords = {Computational modeling,Computer Simulation,Contracts,Equations,Information systems,Laboratories,Mechanical factors,Organizing,TV,Tellurium,Time measurement,Transforms,mscthesis}, mendeley-tags = {Computational modeling,Computer Simulation,Contracts,Equations,Information systems,Laboratories,Mechanical factors,Organizing,TV,Tellurium,Time measurement,Transforms,mscthesis}, month = sep, number = {4}, pages = {76--84}, title = {{Simulation of self-organizing systems by digital computer}}, url = {http://ieeexplore.ieee.org/ielx5/4547596/22771/01057468.pdf?tp=\&arnumber=1057468\&isnumber=22771 
http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1057468\&tag=1 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1057468}, volume = {4}, year = {1954} }  @phdthesis{Freeman2014, abstract = {The performance of a classifier is affected by a number of factors including classifiertype, the input features and the desired output. This thesis examines the impact of featureselection and classification problem division on classification accuracy and complexity.Proper feature selection can reduce classifier size and improve classifier performanceby minimizing the impact of noisy, redundant and correlated features. Noisy features cancause false association between the features and the classifier output. Redundant andcorrelated features increase classifier complexity without adding additional information.Output selection or classification problem division describes the division of a large clas-sification problem into a set of smaller problems. Problem division can improve accuracyby allocating more resources to more difficult class divisions and enabling the use of morespecific feature sets for each sub-problem.The first part of this thesis presents two methods for creating feature-selected hierarchi-cal classifiers. The feature-selected hierarchical classification method jointly optimizes thefeatures and classification tree-design using genetic algorithms. The multi-modal binarytree (MBT) method performs the class division and feature selection sequentially and tol-erates misclassifications in the higher nodes of the tree. This yields a piecewise separationfor classes that cannot be fully separated with a single classifier. Experiments show thatthe accuracy of MBT is comparable to other multi-class extensions, but with lower testtime. Furthermore, the accuracy of MBT is significantly higher on multi-modal data sets.The second part of this thesis focuses on input feature selection measures. 
A numberof filter-based feature subset evaluation measures are evaluated with the goal of assessingtheir performance with respect to specific classifiers. Although there are many featureselection measures proposed in literature, it is unclear which feature selection measuresare appropriate for use with different classifiers. Sixteen common filter-based measures aretested on 20 real and 20 artificial data sets, which are designed to probe for specific featureselection challenges. The strengths and weaknesses of each measure are discussed withrespect to the specific feature selection challenges in the artificial data sets, correlationwith classifier accuracy and their ability to identify known informative features.The results indicate that the best filter measure is classifier-specific. K-nearest neigh-bours classifiers work well with subset-based RELIEF, correlation feature selection or con-ditional mutual information maximization, whereas Fisher’s interclass separability criterionand conditional mutual information maximization work better for support vector machines.Based on the results of the feature selection experiments, two new filter-based measuresare proposed based on conditional mutual information maximization, which performs well but cannot identify dependent features in a set and does not include a check for corre-lated features. Both new measures explicitly check for dependent features and the secondmeasure also includes a term to discount correlated features. Both measures correctly iden-tify known informative features in the artificial data sets and correlate well with classifieraccuracy.The final part of this thesis examines the use of feature selection for time-series databy using feature selection to determine important individual time windows or key framesin the series. Time-series feature selection is used with the MBT algorithm to createclassification trees for time-series data. 
The feature selected MBT algorithm is tested ontwo human motion recognition tasks: full-body human motion recognition from joint angledata and hand gesture recognition from electromyography data. Results indicate that thefeature selected MBT is able to achieve high classification accuracy on the time-series datawhile maintaining a short test time.}, author = {Freeman, Cecille}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Freeman - 2014 - Feature selection and hierarchical classifier design with applications to human motion recognition.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {335}, title = {{Feature selection and hierarchical classifier design with applications to human motion recognition}}, url = {https://ece.uwaterloo.ca/~dkulic/pubs/Freeman\_Cecille.pdf}, year = {2014} }  @article{Fukushima1980, abstract = {A neural network model for a mechanism of visual pattern recognition is proposed in this paper. The network is self-organized by "learning without a teacher", and acquires an ability to recognize stimulus patterns based on the geometrical similarity (Gestalt) of their shapes without affected by their positions. This network is given a nickname "neocognitron". After completion of self-organization, the network has a structure similar to the hierarchy model of the visual nervous system proposed by Hubel and Wiesel. The network consists of an input layer (photoreceptor array) followed by a cascade connection of a number of modular structures, each of which is composed of two layers of cells connected in a cascade. The first layer of each module consists of "S-cells', which show charac- teristics similar to simple cells or lower order hyper- complex cells, and the second layer consists of "C-cells" similar to complex cells or higher order hypercomplex cells. The afferent synapses to each S-cell have plasticity and are modifiable. 
The network has an ability of unsupervised learning: We do not need any "teacher" during the process of self- organization, and it is only needed to present a set of stimulus patterns repeatedly to the input layer of the network. The network has been simulated on a digital computer. After repetitive presentation of a set of stimulus patterns, each stimulus pattern has become to elicit an output only from one of the C-cells of the last layer, and conversely, this C-cell has become selectively responsive only to that stimulus pattern. That is, none of the C-cells of the last layer responds to more than one stimulus pattern. The response of the C-cells of the last layer is not affected by the pattern's position at all. Neither is it affected by a small change in shape nor in size of the stimulus pattern. 1.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Reiteration of self-organized by ''learning without a teacher''$\backslash$item Similar structure to the hierarchy model of the visual nervous system proposed by Hubel and Wiesel.$\backslash$item Network structure:$\backslash$begin\{itemize\}$\backslash$item Input layer (photoreceptor array)$\backslash$item Cascade of modules each one with :$\backslash$begin\{itemize\}$\backslash$item S-cells: in the first layer Simple cells or lower order hypercomplex cells$\backslash$item C-cells: in the second layer Complex cells or higher order hypercomplex cells$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Hubel and Wiesel : the neural network in the visual cortex has a hierarchy structure:$\backslash$begin\{itemize\}$\backslash$item LGB (Lageral Geniculate Body)$\backslash$item Simple cells$\backslash$item Complex cells$\backslash$item Lower order hypercomplex cells$\backslash$item Higher order hypercomplex cells$\backslash$end\{itemize\}$\backslash$item a cell in a higher stage generally has tendency to respond selectively to a more complicated feature of the stimulus pattern$\backslash$item we 
extend the hierarchy model of Hubel and Wiesel, and$\backslash$textbf\{hypothesize\} the existance of a similar hierarchy structure even in hte stages higher than hypercomplex cells.$\backslash$item In the last module, the receptive field of each C-cell becomes so large as to cover the whole area of input layer \$U\_0\$, and each C-plane is so determined as to have only one C-cell$\backslash$item The output of an S-cell in the \$k\_l\$-th S-plane in the l-th module is described below$\backslash$end\{itemize\}}, author = {Fukushima, Kunihiko}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Fukushima - 1980 - Neocognitron A Self-organizing Neural Network Model for a Mechanism of Pattern Recognition Unaffected by Shift in Pos.pdf:pdf}, keywords = {mscthesis,visual cortex}, mendeley-tags = {mscthesis,visual cortex}, title = {{Neocognitron: A Self-organizing Neural Network Model for a Mechanism of Pattern Recognition Unaffected by Shift in Position}}, volume = {202}, year = {1980} }  @book{Gabbay2006, abstract = {ELSEVIER SALE - ALL SCIENCE AND TECHNOLOGY BOOKS 50\% OFF - ONE WEEK ONLYPsychology is the study of thinking, and cognitive science is the interdisciplinary investigation of mind and intelligence that also includes philosophy, artificial intelligence, neuroscience, linguistics, and anthropology. In these investigations, many philosophical issues arise concerning methods and central concepts. 
The Handbook of Philosophy of Psychology and Cognitive Science contains 16 essays by leading philosophers of science that illuminate the nature of the theories and explanations used in the investigation of minds.Topics discussed include representation, mechanisms, reduction, perception, consciousness, language, emotions, neuroscience, and evolutionary psychology.Key Features- Comprehensive coverage of philosophy of psychology and cognitive science- Distinguished contributors: leading philosophers in this area- Contributions closely tied to relevant scientific research}, author = {Gabbay, Dov and Woods, John and Thagard, Paul}, file = {:share/imagedb/perellm1/references/Gabbay, Woods, Thagard\_2006\_Philosophy of Psychology and Cognitive Science A Volume of the Handbook of the Philosophy of Science Series.pdf:pdf}, isbn = {9780080466620}, keywords = {Computers / Intelligence (AI) \& Semantics,Psychology / Cognitive Psychology,Psychology / Cognitive Psychology \& Cognition,mscthesis}, language = {en}, mendeley-tags = {Computers / Intelligence (AI) \& Semantics,Psychology / Cognitive Psychology,Psychology / Cognitive Psychology \& Cognition,mscthesis}, month = oct, pages = {524}, publisher = {Elsevier}, shorttitle = {Philosophy of Psychology and Cognitive Science}, title = {{Philosophy of Psychology and Cognitive Science: A Volume of the Handbook of the Philosophy of Science Series}}, url = {http://books.google.fi/books?id=Lp93PtrvM0MC http://books.google.com/books?hl=en\&lr=\&id=Lp93PtrvM0MC\&oi=fnd\&pg=PP2\&dq=Philosophy+of+Psychology+and+Cognitive+Science:+A+Volume+of+the+Handbook+of+the+Philosophy+of+Science+Series\&ots=HuiLyJm7Rb\&sig=DzZqkzvXzx2NFeUo2fLTcu3ptZQ}, year = {2006} }  @incollection{Garson2012, abstract = {Connectionism is a movement in cognitive science which hopes toexplain human intellectual abilities using artificial neural networks(also known as ‘neural networks’ or ‘neuralnets’). 
Neural networks are simplified models of the braincomposed of large numbers of units (the analogs of neurons) togetherwith weights that measure the strength of connections between theunits. These weights model the effects of the synapses that link oneneuron to another. Experiments on models of this kind have demonstratedan ability to learn such skills as face recognition, reading, and thedetection of simple grammatical structure., Philosophers have become interested in connectionism because itpromises to provide an alternative to the classical theory of the mind:the widely held view that the mind is something akin to a digitalcomputer processing a symbolic language. Exactly how and to what extentthe connectionist paradigm constitutes a challenge to classicism hasbeen a matter of hot debate in recent years.}, author = {Garson, James}, edition = {Winter 201}, editor = {Zalta, Edward N.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/FK3V8854/connectionism.html:html}, keywords = {artificial intelligence,language of thought hypothesis,mental representation,mscthesis}, mendeley-tags = {artificial intelligence,language of thought hypothesis,mental representation,mscthesis}, title = {{Connectionism}}, url = {http://plato.stanford.edu/archives/win2012/entries/connectionism/}, year = {2012} }  @article{Goodfellow2013c, abstract = {We consider the problem of designing models to leverage a recently introduced approximate model averaging technique called dropout. We define a simple new model called maxout (so named because its output is the max of a set of inputs, and because it is a natural companion to dropout) designed to both facilitate optimization by dropout and improve the accuracy of dropout's fast approximate model averaging technique. We empirically verify that the model successfully accomplishes both of these tasks. 
We use maxout and dropout to demonstrate state of the art classification performance on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN.}, archiveprefix = {arXiv}, arxivid = {1302.4389}, author = {Goodfellow, Ian J. and Warde-Farley, David and Mirza, Mehdi and Courville, Aaron and Bengio, Yoshua}, eprint = {1302.4389}, file = {:share/imagedb/perellm1/references/Goodfellow et al.\_2013\_Maxout Networks.pdf:pdf}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = feb, pages = {1319--1327}, title = {{Maxout Networks}}, url = {http://arxiv.org/abs/1302.4389}, year = {2013} }  @article{Goodfellow2013a, abstract = {Recognizing arbitrary multi-character text in unconstrained natural photographs is a hard problem. In this paper, we address an equally hard sub-problem in this domain viz. recognizing arbitrary multi-digit numbers from Street View imagery. Traditional approaches to solve this problem typically separate out the localization, segmentation, and recognition steps. In this paper we propose a unified approach that integrates these three steps via the use of a deep convolutional neural network that operates directly on the image pixels. We employ the DistBelief implementation of deep neural networks in order to train large, distributed neural networks on high quality images. We find that the performance of this approach increases with the depth of the convolutional network, with the best performance occurring in the deepest architecture we trained, with eleven hidden layers. We evaluate this approach on the publicly available SVHN dataset and achieve over 96\% accuracy in recognizing complete street numbers. We show that on a per-digit recognition task, we improve upon the state-of-the-art, achieving 97.84\% accuracy. We also evaluate this approach on an even more challenging dataset generated from Street View imagery containing several tens of millions of street number annotations and achieve over 90\% accuracy. 
To further explore the applicability of the proposed system to broader text recognition tasks, we apply it to synthetic distorted text from reCAPTCHA. reCAPTCHA is one of the most secure reverse turing tests that uses distorted text to distinguish humans from bots. We report a 99.8\% accuracy on the hardest category of reCAPTCHA. Our evaluations on both tasks indicate that at specific operating thresholds, the performance of the proposed system is comparable to, and in some cases exceeds, that of human operators.}, archiveprefix = {arXiv}, arxivid = {arXiv:1312.6082v4}, author = {Goodfellow, IJ and Bulatov, Yaroslav and Ibarz, Julian and Arnoud, Sacha and Shet, Vinay}, eprint = {arXiv:1312.6082v4}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Goodfellow et al. - 2013 - Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks.pdf:pdf}, journal = {arXiv preprint arXiv: \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--13}, title = {{Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks}}, url = {http://arxiv.org/abs/1312.6082}, year = {2013} }  @article{Hao2012, abstract = {In this paper, we consider the problem of modeling complex texture information using undirected probabilistic graphical models. Texture is a special type of data that one can better understand by considering its local structure. For that purpose, we propose a convolutional variant of the Gaussian gated Boltzmann machine (GGBM) [12], inspired by the co-occurrence matrix in traditional texture analysis. We also link the proposed model to a much simpler Gaussian restricted Boltzmann machine where convolutional features are computed as a preprocessing step. 
The usefulness of the model is illustrated in texture classification and reconstruction experiments.}, author = {Hao, Tele and Raiko, Tapani and Ilin, Alexander and Karhunen, Juha}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hao et al. - 2012 - Gated boltzmann machine in texture modeling.pdf:pdf}, journal = {Artificial Neural Networks and Machine Learning -- ICANN 2012}, keywords = {color,deep learning,gated boltzmann machine,gaussian restricted boltzmann machine,mscthesis,texture analysis}, mendeley-tags = {color,mscthesis}, title = {{Gated boltzmann machine in texture modeling}}, url = {http://link.springer.com/chapter/10.1007/978-3-642-33266-1\_16 http://research.ics.aalto.fi/publications/bibdb2012/public\_pdfs/icann12hao.pdf}, year = {2012} }  @book{Hartley1749, author = {Hartley, David}, file = {:share/imagedb/perellm1/references/Hartley\_1749\_Observations on man, his frame, his duty, and his expectations.pdf:pdf}, keywords = {Philosophy,mscthesis}, language = {eng}, mendeley-tags = {Philosophy,mscthesis}, pages = {1052}, publisher = {Scholars' Facsimiles and Reprints}, shorttitle = {Observations on man}, title = {{Observations on man, his frame, his duty, and his expectations}}, url = {http://archive.org/details/observationsonma00hart http://books.google.com/books?hl=en\&lr=\&id=OVwPAAAAIAAJ\&oi=fnd\&pg=PA1\&dq=Observations+on+man::+his+frame,his+duty,and+his+expectations\&ots=YXJ2WbpEIm\&sig=WBWeVrZOlwK1jYCyK46IAkJw-Ww}, year = {1749} }  @book{Hebb1949, address = {New York}, author = {Hebb, Donald Olding}, file = {:share/imagedb/perellm1/references/Hebb\_1949\_The Orgamization of Behavior a Neuropsychological Theory.pdf:pdf}, isbn = {0805843000}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, publisher = {John Wiley \& Sons Inc.}, title = {{The Organization of Behavior: A Neuropsychological Theory}}, year = {1949} }  @article{Hinton1995, abstract = {An unsupervised learning algorithm for a multilayer network of stochastic 
neurons is described. Bottom-up "recognition" connections convert the input into representations in successive hidden layers, and top-down "generative" connections reconstruct the representation in one layer from the representation in the layer above. In the "wake" phase, neurons are driven by recognition connections, and generative connections are adapted to increase the probability that they would reconstruct the correct activity vector in the layer below. In the "sleep" phase, neurons are driven by generative connections, and recognition connections are adapted to increase the probability that they would produce the correct activity vector in the layer above.}, author = {Hinton, G E and Dayan, P and Frey, B J and Neal, R M}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hinton et al. - 1995 - The wake-sleep algorithm for unsupervised neural networks.pdf:pdf}, issn = {0036-8075}, journal = {Science (New York, N.Y.)}, keywords = {Algorithms,Neural Networks (Computer),Probability,Stochastic Processes,mscthesis}, mendeley-tags = {mscthesis}, month = may, number = {5214}, pages = {1158--61}, pmid = {7761831}, title = {{The "wake-sleep" algorithm for unsupervised neural networks.}}, url = {http://www.ncbi.nlm.nih.gov/pubmed/7761831}, volume = {268}, year = {1995} }  @article{Hinton2006, author = {Hinton, GE and Osindero, S and Teh, YW}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hinton, Osindero, Teh - 2006 - A fast learning algorithm for deep belief nets(2).pdf:pdf}, journal = {Neural computation}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1527--1554}, title = {{A fast learning algorithm for deep belief nets}}, url = {http://www.mitpressjournals.org/doi/abs/10.1162/neco.2006.18.7.1527}, volume = {1554}, year = {2006} }  @article{Hinton2012b, abstract = {When a large feedforward neural network is trained on a small training set, it typically performs poorly on held-out 
test data. This "overfitting" is greatly reduced by randomly omitting half of the feature detectors on each training case. This prevents complex co-adaptations in which a feature detector is only helpful in the context of several other specific feature detectors. Instead, each neuron learns to detect a feature that is generally helpful for producing the correct answer given the combinatorially large variety of internal contexts in which it must operate. Random "dropout" gives big improvements on many benchmark tasks and sets new records for speech and object recognition.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Paper about Dropout$\backslash$item Standard way to reduce test error$\backslash$begin\{itemize\}$\backslash$item averaging different models$\backslash$item Computationally expensive in training and test$\backslash$end\{itemize\}$\backslash$item Dropout$\backslash$begin\{itemize\}$\backslash$item Small training set$\backslash$item Prevents overfitting''$\backslash$item They use \$50\backslash\%\\backslash$item Instead of L2 norm, they set an upper bound for each individual neuron.$\backslash$item Mean network : At test time divide all the outgoing weights by 2 to compensate dropout$\backslash$item Specific case$\backslash$begin\{itemize\}$\backslash$item Single hidden layer network$\backslash$item N hidden units$\backslash$item Softmax'' output$\backslash$item \$50\backslash\%\$dropout$\backslash$item during test using mean network$\backslash$item Exactly equivalent to taking the geometric mean of the probability distributions over labels predicted by all \$2\^{}N\$possible networks$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Results$\backslash$begin\{itemize\}$\backslash$item MNIST$\backslash$begin\{itemize\}$\backslash$item No dropout : 160 errors$\backslash$item Dropbout : 130 errors$\backslash$item Dropout + rm random \$20\backslash\%\$pixels : 110 errors$\backslash$item Deep Boltzmann machine : 88 
errors$\backslash$item + Dropout : 79 errors$\backslash$figuremacro\{figures/Hinton2012\_fig5\}\{Visualization of features learned by first layer hidden units\}\{left without dropout and right using dropout\}$\backslash$end\{itemize\}$\backslash$item TIMIT$\backslash$begin\{itemize\}$\backslash$item 4 Fully-connected hidden layers 4000 units per layer$\backslash$item + 185 softmax'' output units$\backslash$item Without dropout : \$22.7\backslash\%\\backslash$item Dropout on hidden units : \$19.7\backslash\%\\backslash$end\{itemize\}$\backslash$item CIFAR-10$\backslash$begin\{itemize\}$\backslash$item Best published : \$18.5\backslash\%\\backslash$item 3 Conv+Max-pool 1 Fully : \$16.6\backslash\%\\backslash$item + Dropout in last hidden layer : \$15.6\backslash\%\\backslash$end\{itemize\}$\backslash$item ImageNet$\backslash$begin\{itemize\}$\backslash$item Average of 6 separate models : \$47.2\backslash\%\\backslash$item state-of-the-art : \$45.7\backslash\%\\backslash$item 5 Conv+Max-pool$\backslash$item + 2 Fully$\backslash$item + 1000 softmax''$\backslash$item Without dropout : \$48.6\backslash\%\\backslash$item Dropout in the 6th : \$42.4\backslash\%\\backslash$end\{itemize\}$\backslash$item Reuters$\backslash$begin\{itemize\}$\backslash$item 2 fully of 2000 hidden units$\backslash$item Without dropout : \$31.05\backslash\%\\backslash$item Dropout : \$29.62\backslash\%\\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, archiveprefix = {arXiv}, arxivid = {arXiv:1207.0580v1}, author = {Hinton, GE and Srivastava, N and Krizhevsky, Alex and Sutskever, I and Salakhutdinov, RR}, eprint = {arXiv:1207.0580v1}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hinton et al. 
- 2012 - Improving neural networks by preventing co-adaptation of feature detectors.pdf:pdf}, journal = {arXiv preprint arXiv: \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1--18}, title = {{Improving neural networks by preventing co-adaptation of feature detectors}}, url = {http://arxiv.org/abs/1207.0580 http://arxiv.org/pdf/1207.0580v1.pdf}, year = {2012} }  @article{Huang2006, author = {Huang, Guang-Bin and Zhu, Qin-Yu and Siew, Chee-Kheong}, doi = {10.1016/j.neucom.2005.12.126}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Huang, Zhu, Siew - 2006 - Extreme learning machine Theory and applications.pdf:pdf}, issn = {09252312}, journal = {Neurocomputing}, keywords = {back-propagation algorithm,extreme learning machine,feedforward neural networks,mscthesis,random,real-time learning,support vector machine}, mendeley-tags = {mscthesis}, month = dec, number = {1-3}, pages = {489--501}, title = {{Extreme learning machine: Theory and applications}}, url = {http://linkinghub.elsevier.com/retrieve/pii/S0925231206000385}, volume = {70}, year = {2006} }  @article{Hubel1968, abstract = {1. The striate cortex was studied in lightly anaesthetized macaque and spider monkeys by recording extracellularly from single units and stimulating the retinas with spots or patterns of light. Most cells can be categorized as simple, complex, or hypercomplex, with response properties very similar to those previously described in the cat. On the average, however, receptive fields are smaller, and there is a greater sensitivity to changes in stimulus orientation. A small proportion of the cells are colour coded.2. Evidence is presented for at least two independent systems of columns extending vertically from surface to white matter. Columns of the first type contain cells with common receptive-field orientations. They are similar to the orientation columns described in the cat, but are probably smaller in cross-sectional area. 
In the second system cells are aggregated into columns according to eye preference. The ocular dominance columns are larger than the orientation columns, and the two sets of boundaries seem to be independent.3. There is a tendency for cells to be grouped according to symmetry of responses to movement; in some regions the cells respond equally well to the two opposite directions of movement of a line, but other regions contain a mixture of cells favouring one direction and cells favouring the other.4. A horizontal organization corresponding to the cortical layering can also be discerned. The upper layers (II and the upper two-thirds of III) contain complex and hypercomplex cells, but simple cells are virtually absent. The cells are mostly binocularly driven. Simple cells are found deep in layer III, and in IV A and IV B. In layer IV B they form a large proportion of the population, whereas complex cells are rare. In layers IV A and IV B one finds units lacking orientation specificity; it is not clear whether these are cell bodies or axons of geniculate cells. In layer IV most cells are driven by one eye only; this layer consists of a mosaic with cells of some regions responding to one eye only, those of other regions responding to the other eye. Layers V and VI contain mostly complex and hypercomplex cells, binocularly driven.5. The cortex is seen as a system organized vertically and horizontally in entirely different ways. In the vertical system (in which cells lying along a vertical line in the cortex have common features) stimulus dimensions such as retinal position, line orientation, ocular dominance, and perhaps directionality of movement, are mapped in sets of superimposed but independent mosaics. 
The horizontal system segregates cells in layers by hierarchical orders, the lowest orders (simple cells monocularly driven) located in and near layer IV, the higher orders in the upper and lower layers.}, author = {Hubel, DH and Wiesel, TN}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hubel, Wiesel - 1968 - Receptive fields and functional architecture of monkey striate cortex.pdf:pdf}, journal = {The Journal of physiology}, keywords = {mscthesis,visual cortex}, mendeley-tags = {mscthesis,visual cortex}, pages = {215--243}, title = {{Receptive fields and functional architecture of monkey striate cortex}}, url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1359523/pdf/jphysiol01247-0121.pdf}, year = {1968} }  @article{Hyvarinen2000b, abstract = {Olshausen and Field (1996) applied the principle of independence maximization by sparse coding to extract features from natural images. This leads to the emergence of oriented linear filters that have simultaneous localization in space and in frequency, thus resembling Gabor functions and simple cell receptive fields. In this article, we show that the same principle of independence maximization can explain the emergence of phase- and shift-invariant features, similar to those found in complex cells. This new kind of emergence is obtained by maximizing the independence between norms of projections on linear subspaces (instead of the independence of simple linear filter outputs). 
The norms of the projections on such “independent feature subspaces” then indicate the values of invariant features.}, author = {Hyv\"{a}rinen, A and Hoyer, Patrik}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hyv\"{a}rinen, Hoyer - 2000 - Emergence of phase-and shift-invariant features by decomposition of natural images into independent feature s.pdf:pdf}, journal = {Neural computation}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1705--1720}, title = {{Emergence of phase-and shift-invariant features by decomposition of natural images into independent feature subspaces}}, url = {http://www.mitpressjournals.org/doi/abs/10.1162/089976600300015312}, volume = {1720}, year = {2000} }  @article{Karpathy, abstract = {Convolutional Neural Networks (CNNs) have been es-tablished as a powerful class of models for image recog-nition problems. Encouraged by these results, we pro-vide an extensive empirical evaluation of CNNs on large- scale video classification using a new dataset of 1 millionYouTube videos belonging to 487 classes. We study mul-tiple approaches for extending the connectivity of a CNNin time domain to take advantage of local spatio-temporalinformation and suggest a multiresolution, foveated archi-tecture as a promising way of speeding up the training.Our best spatio-temporal networks display significant per-formance improvements compared to strong feature-basedbaselines (55.3\% to 63.9\%), but only a surprisingly mod-est improvement compared to single-frame models (59.3\%to 60.9\%). 
We further study the generalization performance of our best model by retraining the top layers on the UCF-101 Action Recognition dataset and observe significant performance improvements compared to the UCF-101 baseline model (63.3\% up from 43.9\%).}
Single frame gives very good results$\backslash$end\{itemize\}$\backslash$item Further work:$\backslash$begin\{itemize\}$\backslash$item Apply some filter for camera movements$\backslash$item Explore RNN from clip-level into video-level$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Karpathy, Andrej and Toderici, G and Shetty, S and Leung, Thomas and Sukthankar, Rahul and Fei-Fei, Li}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Karpathy et al. - 2014 - Large-scale Video Classification with Convolutional Neural Networks.pdf:pdf}, journal = {vision.stanford.edu}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Large-scale Video Classification with Convolutional Neural Networks}}, url = {http://vision.stanford.edu/pdf/karpathy14.pdf}, year = {2014} }  @article{Kavukcuoglu2010, abstract = {We propose an unsupervised method for learning multi-stage hierarchies of sparseconvolutional features. While sparse coding has become an increasingly popularmethod for learning visual features, it is most often trained at the patch level.Applying the resulting filters convolutionally results in highly redundant codesbecause overlapping patches are encoded in isolation. By training convolutionallyover large image windows, our method reduces the redudancy between featurevectors at neighboring locations and improves the efficiency of the overall repre-sentation. In addition to a linear decoder that reconstructs the image from sparsefeatures, our method trains an efficient feed-forward encoder that predicts quasi-sparse features from the input. While patch-based training rarely produces any-thing but oriented edge detectors, we show that convolutional training produceshighly diverse filters, including center-surround filters, corner detectors, cross de-tectors, and oriented grating detectors. 
We show that using these filters in multi-stage convolutional network architecture improves performance on a number of visual recognition and detection tasks.}, author = {Kavukcuoglu, Koray and Sermanet, Pierre and Boureau, Y-lan and LeCun, Yann and Gregor, Karol and Mathieu, Micha\"{e}l}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Kavukcuoglu et al. - 2010 - Learning Convolutional Feature Hierarchies for Visual Recognition.pdf:pdf}, journal = {Advances in Neural Information Processing Systems},
Adobe Systems has pioneered the use of desktop computers for color work, and the author has helped Adobe pick its way down conflicting color paths with confusing road signs over the last 10 years.}, author = {King, JC}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/King - 2002 - Why color management.pdf:pdf}, journal = {9th Congress of the International Color \ldots}, keywords = {color,mscthesis}, mendeley-tags = {color,mscthesis}, title = {{Why color management?}}, url = {http://proceedings.spiedigitallibrary.org/proceeding.aspx?articleid=886641 https://noppa.aalto.fi/noppa/kurssi/t-75.2122/harjoitustyot/T-75\_2122\_why\_color\_management.pdf}, year = {2002} }  @article{Lindeberg1998, abstract = {The fact that objects in the world appear in different ways depending on the scale of observation has important implications if one aims at describing them. It shows that the notion of scale is of utmost importance when processing unknown measurement data by automatic methods. In their seminal works, Witkin (1983) and Koenderink (1984) proposed to approach this problem by representing image structures at different scales in a so-called scale-space representation. Traditional scale-space theory building on this work, however, does not address the problem of how to select local appropriate scales for further analysis. This article proposes a systematic methodology for dealing with this problem. A framework is presented for generating hypotheses about interesting scale levels in image data, based on a general principle stating that local extrema over scales of different combinations of$\gamma-normalized derivatives are likely candidates to correspond to interesting structures. Specifically, it is shown how this idea can be used as a major mechanism in algorithms for automatic scale selection, which adapt the local scales of processing to the local image structure. 
Support for the proposed approach is given in terms of a general theoretical investigation of the behaviour of the scale selection method under rescalings of the input pattern and by integration with different types of early visual modules, including experiments on real-world and synthetic data. Support is also given by a detailed analysis of how different types of feature detectors perform when integrated with a scale selection mechanism and then applied to characteristic model patterns. Specifically, it is described in detail how the proposed methodology applies to the problems of blob detection, junction detection, edge detection, ridge detection and local frequency estimation. In many computer vision applications, the poor performance of the low-level vision modules constitutes a major bottleneck. It is argued that the inclusion of mechanisms for automatic scale selection is essential if we are to construct vision systems to automatically analyse complex unknown environments.}, author = {Lindeberg, Tony}, doi = {10.1023/A:1008045108935}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/IPBAII2X/Lindeberg - 1998 - Feature Detection with Automatic Scale Selection.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/2XG5VRSN/A1008045108935.html:html}, issn = {0920-5691, 1573-1405}, journal = {International journal of computer vision}, keywords = {Artificial Intelligence (incl. Robotics),Automation and Robotics,Computer Imaging- Graphics and Computer Vision,Computer vision,Gaussian derivative,Image Processing,blob detection,corner detection,feature detection,frequency estimation,mscthesis,multi-scale representation,normalized derivative,scale,scale selection,scale-space}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. 
Robotics),Automation and Robotics,Computer Imaging- Graphics and Computer Vision,Computer vision,Gaussian derivative,Image Processing,blob detection,corner detection,feature detection,frequency estimation,mscthesis,multi-scale representation,normalized derivative,scale,scale selection,scale-space}, month = nov, number = {2}, pages = {79--116}, title = {{Feature detection with automatic scale selection}}, url = {http://link.springer.com/article/10.1023/A:1008045108935 http://link.springer.com/content/pdf/10.1023/A:1008045108935.pdf}, volume = {30}, year = {1998} }  @article{Little1975, abstract = {We present a theory of short, intermediate and long term memory of a neural network incorporating the known statistical nature of chemical transmission at the synapses. Correlated pre- and post-synaptic facilitation (related to Hebb's Hypothesis) on three time scales are crucial to the model. Considerable facilitation is needed on a short time scale both for establishing short term memory (active persistent firing pattern for the order of a sec) and the recall of intermediate and long term memory (latent capability for a pattern to be re-excited). Longer lasting residual facilitation and plastic changes (of the same nature as the short term changes) provide the mechanism for imprinting of the intermediate and long term memory. We discuss several interesting features of our theory: nonlocal memory storage, large storage capacity, access of memory, single memory mechanism, robustness of the network and statistical reliability, and usefulness of statistical fluctuations.}, author = {Little, W. A. 
and Shaw, Gordon L.}, doi = {10.1016/S0091-6773(75)90122-4}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/9AHC8TQJ/S0091677375901224.html:html}, issn = {0091-6773}, journal = {Behavioral Biology}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = jun, number = {2}, pages = {115--133}, title = {{A statistical theory of short and long term memory}}, url = {http://www.sciencedirect.com/science/article/pii/S0091677375901224}, volume = {14}, year = {1975} }  @book{Locke1700, abstract = {Many of the earliest books, particularly those dating back to the 1900s and before, are now extremely scarce and increasingly expensive. Pomona Press are republishing these classic works in affordable, high quality, modern editions, using the original text and artwork.}, author = {Locke, John}, isbn = {9781443733342}, keywords = {Literary Collections / Essays,Philosophy / General,mscthesis}, language = {en}, mendeley-tags = {Literary Collections / Essays,Philosophy / General,mscthesis}, month = nov, pages = {390}, publisher = {Read Books}, title = {{An essay concerning human understanding}}, url = {http://books.google.fi/books?id=hPFyvHvgOIAC http://books.google.com/books?hl=en\&lr=\&id=hGeKsjjtu6EC\&oi=fnd\&pg=PR2\&dq=An+Essay+Concerning+Human+Understanding\&ots=93ODeXVdCL\&sig=SNnx\_kGfxKdf1y8\_wv1867ha7Zc}, year = {1700} }  @incollection{Long2014, abstract = {Convolutional neural nets (convnets) trained from massive labeled datasets have substantially improved the state-of-the-art in image classification and object detection. However, visual understanding requires establishing correspondence on a finer level than object category. Given their large pooling regions and training from whole-image labels, it is not clear that convnets derive their success from an accurate correspondence model which could be used for precise localization. 
In this paper, we study the effectiveness of convnet activation features for tasks requiring correspondence. We present evidence that convnet features localize at a much finer scale than their receptive field sizes, that they can be used to perform intraclass aligment as well as conventional hand-engineered features, and that they outperform conventional features in keypoint prediction on objects from PASCAL VOC 2011.}, author = {Long, Jonathan L and Zhang, Ning and Darrell, Trevor}, editor = {Ghahramani, Z. and Welling, M. and Cortes, C. and Lawrence, N. D. and Weinberger, K. Q.}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/3AGAP5Z6/Long et al. - 2014 - Do Convnets Learn Correspondence.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/P4NTJVW9/5420-do-convnets-learn-correspondence.html:html}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {1601--1609}, publisher = {Curran Associates, Inc.}, title = {{Do Convnets Learn Correspondence?}}, url = {http://papers.nips.cc/paper/5420-do-convnets-learn-correspondence.pdf http://papers.nips.cc/paper/5420-do-convnets-learn-correspondence}, year = {2014} }  @article{Malsburg1973, abstract = {A nerve net model for the visual cortex of higher vertebrates is presented. A simple learning procedure is shown to be sufficient for the organization of some essential functional properties of single units. The rather special assumptions usually made in the literature regarding preorganization of the visual cortex are thereby avoided. The model consists of 338 neurones forming a sheet analogous to the cortex. The neurones are connected randomly to a “retina” of 19 cells. Nine different stimuli in the form of light bars were applied. The afferent connections were modified according to a mechanism of synaptic training. After twenty presentations of all the stimuli individual cortical neurones became sensitive to only one orientation. 
Neurones with the same or similar orientation sensitivity tended to appear in clusters, which are analogous to cortical columns. The system was shown to be insensitive to a background of disturbing input excitations during learning. After learning it was able to repair small defects introduced into the wiring and was relatively insensitive to stimuli not used during training.}, author = {von der Malsburg, Chr}, doi = {10.1007/BF00288907}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WXZEB2ZK/BF00288907.html:html}, issn = {0023-5946, 1432-0770}, journal = {Kybernetik}, keywords = {Neurosciences,Zoology,mscthesis}, language = {en}, mendeley-tags = {Neurosciences,Zoology,mscthesis}, month = jun, number = {2}, pages = {85--100}, title = {{Self-organization of orientation sensitive cells in the striate cortex}}, url = {http://link.springer.com/article/10.1007/BF00288907}, volume = {14}, year = {1973} }  @article{Marszalek2009, abstract = {This paper exploits the context of natural dynamic scenes for human action recognition in video. Human actions are frequently constrained by the purpose and the physical properties of scenes and demonstrate high correlation with particular scene classes. For example, eating often happens in a kitchen while running is more common outdoors. The contribution of this paper is three-fold: (a) we automatically discover relevant scene classes and their correlation with human actions, (b) we show how to learn selected scene classes from video without manual supervision and (c) we develop a joint framework for action and scene recognition and demonstrate improved recognition of both in natural video. We use movie scripts as a means of automatic supervision for training. For selected action classes we identify correlated scene classes in text and then retrieve video samples of actions and scenes for training using script-to-video alignment. 
Our visual models for scenes and actions are formulated within the bag-of-features framework and are combined in a joint scene-action SVM-based classifier. We report experimental results and validate the method on a new large dataset with twelve action classes and ten scene classes acquired from 69 movies.}, author = {Marszalek, M and Laptev, Ivan and Schmid, Cordelia}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Marszalek, Laptev, Schmid - 2009 - Actions in context.pdf:pdf}, isbn = {9781424439911}, journal = {Computer Vision and \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, number = {i}, pages = {2929--2936}, title = {{Actions in context}}, url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=5206557}, year = {2009} }  @article{Mikolajczyk2005b, abstract = {In this paper, we compare the performance of descriptors computed for local interest regions, as, for example, extracted by the Harris-Affine detector [Mikolajczyk, K and Schmid, C, 2004]. Many different descriptors have been proposed in the literature. It is unclear which descriptors are more appropriate and how their performance depends on the interest region detector. The descriptors should be distinctive and at the same time robust to changes in viewing conditions as well as to errors of the detector. Our evaluation uses as criterion recall with respect to precision and is carried out for different image transformations. We compare shape context [Belongie, S, et al., April 2002], steerable filters [Freeman, W and Adelson, E, Setp. 1991], PCA-SIFT [Ke, Y and Sukthankar, R, 2004], differential invariants [Koenderink, J and van Doorn, A, 1987], spin images [Lazebnik, S, et al., 2003], SIFT [Lowe, D. G., 1999], complex filters [Schaffalitzky, F and Zisserman, A, 2002], moment invariants [Van Gool, L, et al., 1996], and cross-correlation for different types of interest regions. 
We also propose an extension of the SIFT descriptor and show that it outperforms the original method. Furthermore, we observe that the ranking of the descriptors is mostly independent of the interest region detector and that the SIFT-based descriptors perform best. Moments and steerable filters show the best performance among the low dimensional descriptors.}, author = {Mikolajczyk, K. and Schmid, C.}, doi = {10.1109/TPAMI.2005.188}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/NS6U36FJ/Mikolajczyk and Schmid - 2005 - A performance evaluation of local descriptors.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/7SFR5MSB/abs\_all.html:html}, issn = {0162-8828}, journal = {Pattern Analysis and Machine \ldots}, keywords = {Algorithms,Computer Simulation,Data Interpretation- Statistical,Detectors,Filters,Image Enhancement,Image Interpretation- Computer-Assisted,Image databases,Image recognition,Image retrieval,Index Terms- Local descriptors,Information Storage and Retrieval,Information retrieval,Interest points,Layout,Models- Statistical,Pattern Recognition- Automated,Robustness,Software,Software Validation,Spatial databases,artificial intelligence,complex filters,correlation methods,cross-correlation,filtering theory,image classification,image matching,image transformations,interest regions,invariance,local descriptors,matching,moment invariants,mscthesis,performance evaluation,recognition.,spin images,steerable filters}, mendeley-tags = {Algorithms,Computer Simulation,Data Interpretation- Statistical,Detectors,Filters,Image Enhancement,Image Interpretation- Computer-Assisted,Image databases,Image recognition,Image retrieval,Index Terms- Local descriptors,Information Storage and Retrieval,Information retrieval,Interest points,Layout,Models- Statistical,Pattern Recognition- Automated,Robustness,Software,Software Validation,Spatial databases,artificial intelligence,complex filters,correlation 
methods,cross-correlation,filtering theory,image classification,image matching,image transformations,interest regions,invariance,local descriptors,matching,moment invariants,mscthesis,performance evaluation,recognition.,spin images,steerable filters}, month = oct, number = {10}, pages = {1615--1630}, title = {{A performance evaluation of local descriptors}}, url = {http://ieeexplore.ieee.org/ielx5/34/32189/01498756.pdf?tp=\&arnumber=1498756\&isnumber=32189 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1498756\&tag=1 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1498756}, volume = {27}, year = {2005} }  @article{Mikolajczyk2004, abstract = {In this paper we propose a novel approach for detecting interest points invariant to scale and affine transformations. Our scale and affine invariant detectors are based on the following recent results: (1) Interest points extracted with the Harris detector can be adapted to affine transformations and give repeatable results (geometrically stable). (2) The characteristic scale of a local structure is indicated by a local extremum over scale of normalized derivatives (the Laplacian). (3) The affine shape of a point neighborhood is estimated based on the second moment matrix. Our scale invariant detector computes a multi-scale representation for the Harris interest point detector and then selects points at which a local measure (the Laplacian) is maximal over scales. This provides a set of distinctive points which are invariant to scale, rotation and translation as well as robust to illumination changes and limited changes of viewpoint. The characteristic scale determines a scale invariant region for each point. We extend the scale invariant detector to affine invariance by estimating the affine shape of a point neighborhood. An iterative algorithm modifies location, scale and neighborhood of each point and converges to affine invariant points. 
This method can deal with significant affine transformations including large scale changes. The characteristic scale and the affine shape of neighborhood determine an affine invariant region for each point. We present a comparative evaluation of different detectors and show that our approach provides better results than existing methods. The performance of our detector is also confirmed by excellent matching results; the image is described by a set of scale/affine invariant descriptors computed on the regions associated with our points.}, author = {Mikolajczyk, Krystian and Schmid, Cordelia}, doi = {10.1023/B:VISI.0000027790.02288.f2}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/IZZA85KB/Mikolajczyk and Schmid - 2004 - Scale \& Affine Invariant Interest Point Detectors.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/JBFXDAAM/BVISI.0000027790.02288.html:html}, issn = {0920-5691, 1573-1405}, journal = {International journal of computer vision}, keywords = {Artificial Intelligence (incl. Robotics),Automation and Robotics,Computer Imaging- Graphics and Computer Vision,Image Processing,Interest points,Local features,affine invariance,matching,mscthesis,recognition,scale invariance}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Automation and Robotics,Computer Imaging- Graphics and Computer Vision,Image Processing,Interest points,Local features,affine invariance,matching,mscthesis,recognition,scale invariance}, month = oct, number = {1}, pages = {63--86}, title = {{Scale \& affine invariant interest point detectors}}, url = {http://link.springer.com/article/10.1023/B:VISI.0000027790.02288.f2 http://link.springer.com/content/pdf/10.1023/B:VISI.0000027790.02288.f2.pdf}, volume = {60}, year = {2004} }  @article{Ngiam2011, abstract = {Deep networks have been successfully applied to unsupervised feature learning for single modalities (e.g., text, images or audio). 
In this work, we propose a novel application of deep networks to learn features over multiple modalities. We present a series of tasks for multimodal learning and show how to train deep networks that learn features to address these tasks. In particular, we demonstrate cross modality feature learning, where better features for one modality (e.g., video) can be learned if multiple modalities (e.g., audio and video) are present at feature learning time. Furthermore, we show how to learn a shared representation between modalities and evaluate it on a unique task, where the classifier is trained with audio-only data but tested with video-only data and vice-versa. Our models are validated on the CUAVE and AVLetters datasets on audio-visual speech classification, demonstrating best published visual speech classification on AVLetters and effective shared representation learning.},
  author = {Ngiam, Jiquan and Khosla, Aditya and Kim, Mingyu and Nam, Juhan and Lee, Honglak and Ng, Andrew Y.},
  file = {:share/imagedb/perellm1/references/Ngiam et al.\_2011\_Multimodal deep learning.pdf:pdf},
  journal = {International Conference on Machine Learning},
  title = {{Multimodal deep learning}},
  url = {http://machinelearning.wustl.edu/mlpapers/paper\_files/ICML2011Ngiam\_399.pdf},
  volume = {28},
  year = {2011}
}

@article{Niebles2008,
  abstract = {We present a novel unsupervised learning method for human action categories. A video sequence is represented as a collection of spatial-temporal words by extracting space-time interest points. The algorithm automatically learns the probability distributions of the spatial-temporal words and the intermediate topics corresponding to human action categories. This is achieved by using latent topic models such as the probabilistic Latent Semantic Analysis (pLSA) model and Latent Dirichlet Allocation (LDA). Our approach can handle noisy feature points arisen from dynamic background and moving cameras due to the application of the probabilistic models. Given a novel video sequence, the algorithm can categorize and localize the human action(s) contained in the video. We test our algorithm on three challenging datasets: the KTH human motion dataset, the Weizmann human action dataset, and a recent dataset of figure skating actions. Our results reflect the promise of such a simple approach. In addition, our algorithm can recognize and localize multiple actions in long and complex video sequences containing multiple motions.},
  author = {Niebles, Juan Carlos and Wang, Hongcheng and Fei-Fei, Li},
  doi = {10.1007/s11263-007-0122-4},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Niebles, Wang, Fei-Fei - 2008 - Unsupervised Learning of Human Action Categories Using Spatial-Temporal Words.pdf:pdf},
  issn = {0920-5691},
  journal = {International Journal of Computer Vision},
  keywords = {action categorization,bag of words,mscthesis,spatio-temporal interest points,topic models,unsupervised learning},
  mendeley-tags = {mscthesis},
  month = mar,
  number = {3},
  pages = {299--318},
  title = {{Unsupervised Learning of Human Action Categories Using Spatial-Temporal Words}},
  url = {http://link.springer.com/10.1007/s11263-007-0122-4},
  volume = {79},
  year = {2008}
}

@article{Ning2005,
  abstract = {We describe a trainable system for analyzing videos of developing C. elegans embryos. The system automatically detects, segments, and locates cells and nuclei in microscopic images. The system was designed as the central component of a fully automated phenotyping system. The system contains three modules 1) a convolutional network trained to classify each pixel into five categories: cell wall, cytoplasm, nucleus membrane, nucleus, outside medium; 2) an energy-based model, which cleans up the output of the convolutional network by learning local consistency constraints that must be satisfied by label images; 3) a set of elastic models of the embryo at various stages of development that are matched to the label images.},
  author = {Ning, Feng and Delhomme, Damien and LeCun, Yann and Piano, Fabio and Bottou, L{\'e}on and Barbano, Paolo Emilio},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Ning et al. - 2005 - Toward automatic phenotyping of developing embryos from videos.pdf:pdf},
  issn = {1057-7149},
  journal = {IEEE Transactions on Image Processing},
  keywords = {Algorithms,Animals,Artificial Intelligence,Automated,Automated: methods,Caenorhabditis elegans,Caenorhabditis elegans: anatomy \& histology,Caenorhabditis elegans: classification,Caenorhabditis elegans: embryology,Caenorhabditis elegans: growth \& development,Computer-Assisted,Computer-Assisted: methods,Embryo,Fetal Development,Fetal Development: physiology,Image Enhancement,Image Enhancement: methods,Image Interpretation,Microscopy,Nonmammalian,Nonmammalian: cytology,Pattern Recognition,Phase-Contrast,Phase-Contrast: methods,Phenotype,Reproducibility of Results,Sensitivity and Specificity,Video,Video: methods,mscthesis},
  mendeley-tags = {mscthesis},
  month = sep,
  number = {9},
  pages = {1360--1371},
  pmid = {16190471},
  title = {{Toward automatic phenotyping of developing embryos from videos}},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/16190471},
  volume = {14},
  year = {2005}
}

@article{Nowozin2011, abstract = {Powerful statistical models that can be learned efficiently from large amounts of data are currently revolutionizing computer vision.
These models possess a rich internal structure reflecting task-specific relations and constraints. This monograph introduces the reader to the most popular classes of structured models in computer vision. Our focus is discrete undirected graphical models which we cover in detail together with a description of algorithms for both probabilistic inference and maximum a posteriori inference. We discuss separately recently successful techniques for prediction in general structured models. In the second part of this monograph we describe methods for parameter learning where we distinguish the classic maximum likelihood based methods from the more recent prediction-based parameter learning methods. We highlight developments to enhance current models and discuss kernelized models and latent variable models. To make the monograph more practical and to provide links to further study we provide examples of successful application of many methods in the computer vision literature.}, author = {Nowozin, Sebastian and Lampert, CH}, doi = {10.1561/0600000033}, file = {:share/imagedb/perellm1/references/Nowozin, Lampert\_2011\_Structured learning and prediction in computer vision.pdf:pdf}, issn = {1572-2740}, journal = {\ldots and Trends® in Computer Graphics and Vision}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = mar, number = {3\&\#8211;4}, pages = {185--365}, title = {{Structured learning and prediction in computer vision}}, url = {http://dx.doi.org/10.1561/0600000033 http://dl.acm.org/citation.cfm?id=2185834}, volume = {6}, year = {2011} }  @techreport{Parker1985, author = {Parker, D.}, institution = {Invention Report S81-64, File 1, Cambridge, MA: Center for Computational Research in Economics and Management Science, MIT.}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{Learning-logic}}, year = {1982} }  @book{Pearl1997, abstract = {Probabilistic Reasoning in Intelligent Systems is a complete and accessible account of the theoretical 
foundations and computational methods that underlie plausible reasoning under uncertainty. The author provides a coherent explication of probability as a language for reasoning with partial belief and offers a unifying perspective on other AI approaches to uncertainty, such as the Dempster-Shafer formalism, truth maintenance systems, and nonmonotonic logic. The author distinguishes syntactic and semantic approaches to uncertainty--and offers techniques, based on belief networks, that provide a mechanism for making semantics-based systems operational. Specifically, network-propagation techniques serve as a mechanism for combining the theoretical coherence of probability theory with modern demands of reasoning-systems technology: modular declarative inputs, conceptually meaningful inferences, and parallel distributed computation. Application areas include diagnosis, forecasting, image interpretation, multi-sensor fusion, decision support systems, plan recognition, planning, speech recognition--in short, almost every task requiring that conclusions be drawn from uncertain clues and incomplete information. Probabilistic Reasoning in Intelligent Systems will be of special interest to scholars and researchers in AI, decision theory, statistics, logic, philosophy, cognitive psychology, and the management sciences. Professionals in the areas of knowledge-based systems, operations research, engineering, and statistics will find theoretical and computational tools of immediate practical use. The book can also be used as an excellent text for graduate-level courses in AI, operations research, or applied probability.},
  author = {Pearl, Judea},
  file = {:share/imagedb/perellm1/references/Pearl\_1997\_Probabilistic Reasoning in Intelligent Systems Networks of Plausible Inference.pdf:pdf},
  isbn = {9781558604797},
  keywords = {Computers / Intelligence (AI) \& Semantics,mscthesis},
  language = {en},
  mendeley-tags = {Computers / Intelligence (AI) \& Semantics,mscthesis},
  pages = {576},
  publisher = {Morgan Kaufmann},
  shorttitle = {Probabilistic Reasoning in Intelligent Systems},
  title = {{Probabilistic Reasoning in Intelligent Systems: Networks of Plausible Inference}},
  url = {http://books.google.fi/books?id=AvNID7LyMusC},
  year = {1997}
}

@techreport{Petersen2008,
  author = {Petersen, Kaare Brandt and Pedersen, Michael Syskind},
  file = {:share/imagedb/perellm1/references/Petersen, Pedersen\_2008\_The matrix cookbook.pdf:pdf},
  institution = {Technical University of Denmark},
  keywords = {matrix algebra,matrix identities,matrix relations,mscthesis},
  mendeley-tags = {mscthesis},
  pages = {1--56},
  title = {{The matrix cookbook}},
  url = {http://www.ece.mcmaster.ca/~reilly/ece712/matrix facts and identites.pdf},
  year = {2008}
}

@article{Rochester1956, abstract = {Theories by D.O. Hebb and P.M. Milner on how the brain works were tested by simulating neuron nets on the IBM Type 704 Electronic Calculator. The formation of cell assemblies from an unorganized net of neurons was demonstrated, as well as a plausible mechanism for short-term memory and the phenomena of growth and fractionation of cell assemblies. The cell assemblies do not yet act just as the theory requires, but changes in the theory and the simulation offer promise for further experimentation.}, author = {Rochester, N.
and Holland, J. H. and Haibt, L. H. and Duda, W. L.},
  doi = {10.1109/TIT.1956.1056810},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/T9CT97EZ/Rochester et al. - 1956 - Tests on a cell assembly theory of the action of t.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/IIZSQN2F/abs\_all.html:html},
  issn = {0096-1000},
  journal = {IRE Transactions on Information Theory},
  keywords = {Assembly,Brain,Brain modeling,Computational modeling,Electronic equipment testing,Fractionation,Laboratories,Neurons,Neurophysiology,Organisms,Psychology,mscthesis,neural networks},
  mendeley-tags = {mscthesis},
  month = sep,
  number = {3},
  pages = {80--93},
  title = {{Tests on a cell assembly theory of the action of the brain, using a large digital computer}},
  url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1056810},
  volume = {2},
  year = {1956}
}

@techreport{Rosenblatt1961,
  abstract = {Part I attempts to review the background, basic sources of data, concepts, and methodology to be employed in the study of perceptrons. In Chapter 2, a brief review of the main alternative approaches to the development of brain models is presented. Chapter 3 considers the physiological and psychological criteria for a suitable model, and attempts to evaluate the empirical evidence which is available on several important issues. Chapter 4 contains basic definitions and some of the notation to be used in later sections are presented. Parts II and III are devoted to a summary of the established theoretical results obtained to date. Part II (Chapters 5 through 14) deals with the theory of three-layer series-coupled perceptrons, on which most work has been done to date. Part III (Chapters 15 through 20) deals with the theory of multi-layer and cross-coupled perceptrons. Part IV is concerned with more speculative models and problems for future analysis. Of necessity, the final chapters become increasingly heuristic in character, as the theory of perceptrons is not yet complete, and new possibilities are continually coming to light.},
  address = {Buffalo 21, N.Y.},
  author = {Rosenblatt, Frank},
  file = {:share/imagedb/perellm1/references/Rosenblatt\_1961\_Principles of neurodynamics. perceptrons and the theory of brain mechanisms.pdf:pdf},
  institution = {Cornell Aeronautical Laboratory, Inc.},
  keywords = {*CYBERNETICS,*NERVOUS SYSTEM,*NEURODYNAMICS,*PERCEPTION,Anatomy and Physiology,Brain,Computers,Cybernetics,Mathematical analysis,Psychology,THEORY,mscthesis},
  language = {en},
  mendeley-tags = {mscthesis},
  month = mar,
  title = {{Principles of Neurodynamics: Perceptrons and the Theory of Brain Mechanisms}},
  url = {http://www.dtic.mil/cgi-bin/GetTRDoc?AD=AD0256582},
  year = {1961}
}

@article{Schindler2008, abstract = {Visual recognition of human actions in video clips has been an active field of research in recent years. However, most published methods either analyse an entire video and assign it a single action label, or use relatively large look-ahead to classify each frame. Contrary to these strategies, human vision proves that simple actions can be recognised almost instantaneously. In this paper, we present a system for action recognition from very short sequences (``snippets'') of 1-10 frames, and systematically evaluate it on standard data sets.
It turns out that even local shape and optic flow for a single frame are enough to achieve approximately 90\% correct recognitions, and snippets of 5-7 frames (0.3-0.5 seconds of video) are enough to achieve a performance similar to the one obtainable with the entire video sequence.},
  author = {Schindler, Konrad and van Gool, Luc},
  doi = {10.1109/CVPR.2008.4587730},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Schindler, van Gool - 2008 - Action snippets How many frames does human action recognition require.pdf:pdf},
  isbn = {978-1-4244-2242-5},
  journal = {2008 IEEE Conference on Computer Vision and Pattern Recognition},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  month = jun,
  pages = {1--8},
  publisher = {IEEE},
  title = {{Action snippets: How many frames does human action recognition require?}},
  url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=4587730},
  year = {2008}
}

@inproceedings{Schuldt2004,
  abstract = {Local space-time features capture local events in video and can be adapted to the size, the frequency and the velocity of moving patterns. In this paper, we demonstrate how such features can be used for recognizing complex motion patterns. We construct video representations in terms of local space-time features and integrate such representations with SVM classification schemes for recognition. For the purpose of evaluation we introduce a new video database containing 2391 sequences of six human actions performed by 25 people in four different scenarios. The presented results of action recognition justify the proposed method and demonstrate its advantage compared to other relative approaches for action recognition.},
  author = {Schuldt, Christian and Laptev, Ivan and Caputo, Barbara},
  booktitle = {Proceedings of the 17th International Conference on Pattern Recognition (ICPR)},
  file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Schuldt, Laptev, Caputo - 2004 - Recognizing human actions a local SVM approach.pdf:pdf},
  keywords = {mscthesis},
  mendeley-tags = {mscthesis},
  pages = {3--7},
  title = {{Recognizing human actions: a local SVM approach}},
  url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1334462},
  year = {2004}
}

@article{Simonyan2014a, abstract = {We investigate architectures of discriminatively trained deep Convolutional Networks (ConvNets) for action recognition in video. The challenge is to capture the complementary information on appearance from still frames and motion between frames. We also aim to incorporate into the network design aspects of the best performing hand-crafted features. Our contribution is three-fold. First, we propose a two-stream ConvNet architecture which incorporates spatial and temporal networks. Second, we demonstrate that a ConvNet trained on multi-frame dense optical flow is able to achieve very good performance in spite of limited training data. Finally, we show that multi-task learning, applied to two different action classification datasets, can be used to increase the amount of training data and improve the performance on both. Our architecture is trained and evaluated on the standard video actions benchmarks of UCF-101 and HMDB-51, where it matches the state of the art.
It also exceeds by a large margin previous attempts to use deep nets for video classification.}, annote = {\backslash$begin\{itemize\}$\backslash$item Action recognition using two paths on a CNN$\backslash$item First one using only frames$\backslash$item Second one using optical flow$\backslash$item Datasets$\backslash$begin\{itemize\}$\backslash$item ImageNet ILSVRC-2012 (pretraining)$\backslash$item UCF-101: 9.5K videos$\backslash$item HMDB-15: 3.7K videos$\backslash$end\{itemize\}$\backslash$item Biological inspiration by two paths on our visual cortex$\backslash$begin\{itemize\}$\backslash$item Ventral stream performs object recognition$\backslash$item Dorsal stream recognises motion$\backslash$end\{itemize\}$\backslash$item Action recognition approaches commonly use$\backslash$begin\{itemize\}$\backslash$item High dimensional encodings of spatio-temporal features$\backslash$item Classification with shallow methods$\backslash$item Some of the features extracted by:$\backslash$begin\{itemize\}$\backslash$item Histogram of Oriented Gradients (HOG)$\backslash$item Histogram of Optical Flow (HOF)$\backslash$end\{itemize\}$\backslash$item Then features merged with Bag Of Features (BoF)$\backslash$item Final classification using SVM$\backslash$item state-of-the-art Motion Boundary Histogram (MBH)$\backslash$item Compensation of camera motion is very important$\backslash$item Fisher vector encodings (deep version on$\backslash$cite\{Simonyan2013a\}$\backslash$end\{itemize\}$\backslash$item Two methods for merging the two CNN softmax layers$\backslash$begin\{itemize\}$\backslash$item Averaging their outputs$\backslash$item Training a multi-class linear SVM$\backslash$end\{itemize\}$\backslash$item The two CNN$\backslash$begin\{itemize\}$\backslash$item Spatial stream ConvNet:$\backslash$begin\{itemize\}$\backslash$item Individual frames$\backslash$item It can be pretrained with image datasets (Ex: ImageNet)$\backslash$end\{itemize\}$\backslash$item Optical flow ConvNet 
options:$\backslash$begin\{itemize\}$\backslash$item Optical flow stacking: From L frames extract L+1 optical flow input channels$\backslash$item Trajectory stacking: This follows the optical flows as following the different objects. -> -----> ---> -> -->$\backslash$item Bi-directional optical flow: Like mirroring in images, it is possible to use Forward and Backward optical flows (data augmentation?)$\backslash$item Mean flow subtraction: To center the inputs on the non-linearity center. Accentuated sometimes by camera motion. They solve this problem subtracting the mean of each displacement, this is less computational costly, but also less precise$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item Multi-task learning$\backslash$begin\{itemize\}$\backslash$item On top of the CNN two softmax layers are added$\backslash$item One is only trained for HMDB-51 while the other on UCF-101$\backslash$end\{itemize\}$\backslash$item Training$\backslash$begin\{itemize\}$\backslash$item Random crop$\backslash$item Random mirroring$\backslash$item Random RGB jittering$\backslash$item learning rate: 0.01, 0.001, 0.0001$\backslash$end\{itemize\}$\backslash$item Results$\backslash$begin\{itemize\}$\backslash$item Pretraining with ILSVRC-2012 improves results$\backslash$item Optical flow in general works better than extracting this information from pairs of images$\backslash$item Temporal and spatial information is complementary$\backslash$item Augmenting the data is very beneficial$\backslash$item Pretraining with large amounts of images improves the generalization$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Simonyan, Karen and Zisserman, Andrew}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Simonyan, Zisserman - 2014 - Two-Stream Convolutional Networks for Action Recognition in Videos(2).pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Simonyan, Zisserman - 2014 - 
Two-Stream Convolutional Networks for Action Recognition in Videos(2).html:html}, journal = {Advances in Neural Information \ldots}, keywords = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, month = jun, title = {{Two-stream convolutional networks for action recognition in videos}}, url = {http://arxiv.org/abs/1406.2199 http://www.arxiv.org/pdf/1406.2199.pdf http://papers.nips.cc/paper/5353-two-stream-convolutional-networks-for-action-recognition-in-videos}, year = {2014} }  @article{Srivastava2014, abstract = {Deep neural nets with a large number of parameters are very powerful machine learning systems. However, overfitting is a serious problem in such networks. Large networks are also slow to use, making it difficult to deal with overfitting by combining the predictions of many different large neural nets at test time. Dropout is a technique for addressing this problem. The key idea is to randomly drop units (along with their connections) from the neural network during training. This prevents units from co-adapting too much. During training, dropout samples from an exponential number of different “thinned” networks. At test time, it is easy to approximate the effect of averaging the predictions of all these thinned networks by simply using a single unthinned network that has smaller weights. This significantly reduces overfitting and gives major improvements over other regularization methods. 
We show that dropout improves the performance of neural networks on supervised learning tasks in vision, speech recognition, document classification and computational biology, obtaining state-of-the-art results on many benchmark data sets.}, author = {Srivastava, N and Hinton, Geoffrey}, file = {:share/imagedb/perellm1/references/Srivastava, Hinton\_2014\_Dropout A Simple Way to Prevent Neural Networks from Overfitting.pdf:pdf}, journal = {Journal of Machine \ldots}, keywords = {deep learning,model combination,mscthesis,neural networks,regularization}, mendeley-tags = {mscthesis}, pages = {1929--1958}, title = {{Dropout: A Simple Way to Prevent Neural Networks from Overfitting}}, url = {http://jmlr.org/papers/v15/srivastava14a.html}, volume = {15}, year = {2014} }  @article{Svozil1997, author = {Svozil, Daniel and Kvasnicka, V and Pospichal, Jie}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Svozil, Kvasnicka, Pospichal - 1997 - Introduction to multi-layer feed-forward neural networks.pdf:pdf}, journal = {Chemometrics and intelligent \ldots}, keywords = {mscthesis,neural networks}, mendeley-tags = {mscthesis}, pages = {43--62}, title = {{Introduction to multi-layer feed-forward neural networks}}, url = {http://www.sciencedirect.com/science/article/pii/S0169743997000610}, volume = {39}, year = {1997} }  @article{Szegedy2014, abstract = {We propose a deep convolutional neural network architecture codenamed "Inception", which was responsible for setting the new state of the art for classification and detection in the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC 2014). The main hallmark of this architecture is the improved utilization of the computing resources inside the network. This was achieved by a carefully crafted design that allows for increasing the depth and width of the network while keeping the computational budget constant. 
To optimize quality, the architectural decisions were based on the Hebbian principle and the intuition of multi-scale processing. One particular incarnation used in our submission for ILSVRC 2014 is called GoogLeNet, a 22 layers deep network, the quality of which is assessed in the context of classification and detection.}, author = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WTGZA6Q7/Szegedy et al. - 2014 - Going Deeper with Convolutions.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/8BWWB6AU/1409.html:html}, journal = {arXiv preprint arXiv: \ldots}, keywords = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, mendeley-tags = {Computer Science - Computer Vision and Pattern Rec,mscthesis}, month = sep, title = {{Going deeper with convolutions}}, url = {http://arxiv.org/abs/1409.4842 http://www.arxiv.org/pdf/1409.4842.pdf}, year = {2014} }  @article{Uttley1956, author = {Uttley, A. M.}, journal = {Automata Studies}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {277-- 285}, title = {{Temporal and spatial patterns in a conditional probability machine}}, year = {1956} }  @article{Vemuri1992, address = {Los Alamitos, Calif}, author = {Vemuri, V. 
Rao}, isbn = {9780818690693}, journal = {ieeexplore.ieee.org}, keywords = {mscthesis}, language = {English}, mendeley-tags = {mscthesis}, month = sep, pages = {509}, publisher = {Ieee Computer Society}, shorttitle = {Artificial Neural Networks}, title = {{Artificial Neural Networks: Concepts and Control Applications}}, url = {http://www.amazon.com/Artificial-Neural-Networks-Concepts-Applications/dp/0818690690 http://ieeexplore.ieee.org/iel1/38/5255/x0323385.pdf}, year = {1992} }  @article{Voravuthikunchai2014, abstract = {This paper introduces a novel image representation capturing feature dependencies through the mining of meaningful combinations of visual features. This representation leads to a compact and discriminative encoding of images that can be used for image classification, object detection or object recognition. The method relies on (i) multiple random projections of the input space followed by local binarization of projected histograms encoded as sets of items, and (ii) the representation of images as Histograms of Pattern Sets (HoPS). The approach is validated on four publicly available datasets (Daimler Pedestrian, Oxford Flowers, KTH Texture and PASCAL VOC2007), allowing comparisons with many recent approaches. 
The proposed image representation reaches state-of-the-art performance on each one of these datasets.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Pattern mining$\backslash$item dimentionality reduction$\backslash$item feature selection$\backslash$item feature augmentation$\backslash$item Common image representations with real-valued histograms$\backslash$begin\{itemize\}$\backslash$item Local Binary Patterns (LBP)$\backslash$item Histograms of Oriented Gradients (HOG)$\backslash$item Bag-of-Words (BoW)$\backslash$end\{itemize\}$\backslash$item Authors propose Histograms of Pattern Sets (HoPS).$\backslash$begin\{itemize\}$\backslash$item Extract some features from the images, for example BoW.$\backslash$item Randomly select N features (in this case visual words)$\backslash$item Binarize the selected feature histograms$\backslash$begin\{itemize\}$\backslash$item The top-K selected features with higher occurrencies are set to one.$\backslash$item The rest is set to zero.$\backslash$item Group the features with value 1 as a transaction of size K$\backslash$end\{itemize\}$\backslash$item Repeat the random selection P times and create one transaction at each step$\backslash$item Apply data mining techniques to select the most discriminative transactions, for example:$\backslash$begin\{itemize\}$\backslash$item Frequent Patterns (FPs)$\backslash$cite\{Agrawal1993a\}$\backslash$item Jumping Emerging Patterns (JEPs)$\backslash$cite\{Dong1999\}$\backslash$begin\{itemize\}$\backslash$item positive JEPs: random projections that are found only in the positive images$\backslash$item negative JEPs: random projections that are found onluy in the negative images$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$item The final representaiton is a histogram of 2xP bins where P are the total number of projections and one positive JEP and negative JEP per projection$\backslash$item Train a classifier with this representation$\backslash$item 
Results$\backslash$begin\{itemize\}$\backslash$item A linear SVM trained with HoPS improved the performance on the original features from 68.8 to 74.1$\backslash$item A RBF-Chi\^{}2 trained with HoPs improved the performance on the original features from 71.2 to 73.7$\backslash$item Using HoPS and original features togheter improves.$\backslash$end\{itemize\}$\backslash$item Image classification: state-of-the-art in Oxford-Flowers 17 dataset$\backslash$item Texture recognition: good results on KTH-TIPS2a$\backslash$item Object detection: state-of-the-art in PASCAL VOC 2007 dataset$\backslash$item Pedestrian recognition: state-of-the-art in pedestrian recognition$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Voravuthikunchai, Winn}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/V88DIGQ5/Voravuthikunchai et al. - 2013 - Histograms of Pattern Sets for Image Classificatio.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/5J9JW9PC/Voravuthikunchai\_Histograms\_of\_Pattern\_2014\_CVPR\_paper.html:html}, journal = {IEEE Conference on \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {224--231}, title = {{Histograms of pattern sets for image classification and object recognition}}, url = {http://www.cv-foundation.org/openaccess/content\_cvpr\_2014/html/Voravuthikunchai\_Histograms\_of\_Pattern\_2014\_CVPR\_paper.html http://www.cv-foundation.org/openaccess/content\_cvpr\_2014/papers/Voravuthikunchai\_Histograms\_of\_Pattern\_2014\_CVPR\_paper.pdf http://hal.archives-ouvertes.fr/hal-00980894/}, year = {2014} }  @article{Wang2009, abstract = {Local space-time features have recently become a popular video representation for action recognition. Several methods for feature localization and description have been proposed in the literature and promising recognition results were demonstrated for a number of action classes. 
The comparison of existing methods, however, is often limited given the different experimental settings used. The purpose of this paper is to evaluate and compare previously proposed space-time features in a common experimental setup. In particular, we consider four different feature detectors and six local feature descriptors and use a standard bag-of-features SVM approach for action recognition. We investigate the performance of these methods on a total of 25 action classes distributed over three datasets with varying difficulty. Among interesting conclusions, we demonstrate that regular sampling of space-time features consistently outperforms all tested space-time interest point detectors for human actions in realistic settings. We also demonstrate a consistent ranking for the majority of methods over different datasets and discuss their advantages and limitations.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Detectors$\backslash$begin\{itemize\}$\backslash$item Harris3D$\backslash$item Cuboid$\backslash$item Hessian$\backslash$item Dense sampling$\backslash$end\{itemize\}$\backslash$item Descriptors$\backslash$begin\{itemize\}$\backslash$item HOG/HOF$\backslash$item HOG3D$\backslash$item ESURF (extended SURF)$\backslash$end\{itemize\}$\backslash$item Datasets$\backslash$begin\{itemize\}$\backslash$item KTH actions$\backslash$begin\{itemize\}$\backslash$item 6 human action classes$\backslash$item walking, jogging, running, boxing, waving and clapping$\backslash$item 25 subjects$\backslash$item 4 scenarios$\backslash$item 2391 video samples$\backslash$item$\backslash$url\{http://www.nada.kth.se/cvap/actions/\}$\backslash$end\{itemize\}$\backslash$item UCF sport actions$\backslash$begin\{itemize\}$\backslash$item 10 human action classes$\backslash$item winging, diving, kicking, weight-lifting, horse-riding, running, skateboarding, swinging, golf swinging and walking$\backslash$item 150 video 
samples$\backslash$item$\backslash$url\{http://crcv.ucf.edu/data/UCF$\backslash$\_Sports$\backslash$\_Action.php\}$\backslash$end\{itemize\}$\backslash$item Hollywood2 actions$\backslash$begin\{itemize\}$\backslash$item 12 action classes$\backslash$item answering the phone, driving car, eating, fighting, getting out of the car, hand shaking, hugging, kissing, running, sitting down, sitting up, and standing up.$\backslash$item 69 Hollywood movies$\backslash$item 1707 video samples$\backslash$item$\backslash$url\{http://www.di.ens.fr/$\backslash$\~{}laptev/actions/hollywood2/\}$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash$end\{itemize\}}, author = {Wang, Heng and Ullah, Muhammad Muneeb and Klaser, Alexander and Laptev, Ivan and Schmid, Cordelia}, doi = {10.5244/C.23.124}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Wang et al. - 2009 - Evaluation of local spatio-temporal features for action recognition(4).pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Wang et al. 
- 2009 - Evaluation of local spatio-temporal features for action recognition(5).pdf:pdf}, isbn = {1-901725-39-1}, journal = {Proceedings of the British Machine Vision Conference 2009}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {124.1--124.11}, publisher = {British Machine Vision Association}, title = {{Evaluation of local spatio-temporal features for action recognition}}, url = {http://www.bmva.org/bmvc/2009/Papers/Paper143/Paper143.html}, year = {2009} }  @techreport{Widrow1960, address = {Stanford, California}, author = {Widrow, Bernard}, file = {:share/imagedb/perellm1/references/Widrow\_1960\_An Adaptive ADALINE Neuron Using Chemical Memistors.pdf:pdf}, institution = {Solid-State Electronics Laboratory, Stanford Electronics Laboratories, Stanford University}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, title = {{An Adaptive "ADALINE" Neuron Using Chemical "Memistors"}}, year = {1960} }  @article{Williams1989b, abstract = {The exact form of a gradient-following learning algorithm for completely recurrent networks running in continually sampled time is derived and used as the basis for practical algorithms for temporal supervised learning tasks. These algorithms have (1) the advantage that they do not require a precisely defined training interval, operating while the network runs; and (2) the disadvantage that they require nonlocal communication in the network being trained and are computationally expensive. 
These algorithms allow networks having recurrent connections to learn complex tasks that require the retention of information over time periods having either fixed or indefinite length.}, author = {Williams, RJ and Zipser, David}, doi = {10.1162/neco.1989.1.2.270}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/UNZF46W3/Williams and Zipser - 1989 - A Learning Algorithm for Continually Running Fully.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/KCHBMQ7X/neco.1989.1.2.html:html;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/IIVSQQ8E/neco.1989.1.2.html:html}, issn = {0899-7667}, journal = {Neural computation}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, month = jun, number = {2}, pages = {270--280}, title = {{A learning algorithm for continually running fully recurrent neural networks}}, url = {http://dx.doi.org/10.1162/neco.1989.1.2.270 http://www.mitpressjournals.org/doi/abs/10.1162/neco.1989.1.2.270\#.U-N6-NaA8UB http://www.mitpressjournals.org/doi/pdf/10.1162/neco.1989.1.2.270\#.U-N6-NaA8UB http://www.mitpressjournals.org/doi/abs/10.1162/neco.1989.1.2.270 http://www.mitpressjournals.org/doi/abs/10.1162/neco.1989.1.2.270\#.U-OC09aA8UC http://www.mitpressjournals.org/doi/pdf/10.1162/neco.1989.1.2.270\#.U-OC09aA8UC}, volume = {1}, year = {1989} }  @article{Young1802, author = {Young, Thomas}, doi = {10.1098/rstl.1802.0004}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/WMWNTXRD/Young - 1802 - The Bakerian Lecture On the Theory of Light and C.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/W5Z8VP43/12.html:html}, issn = {0261-0523,}, journal = {Philosophical transactions of the Royal Society of \ldots}, language = {en}, month = jan, pages = {12--48}, shorttitle = {The Bakerian Lecture}, title = {{The Bakerian lecture: On the theory of light and colours}}, url = 
{http://rstl.royalsocietypublishing.org/content/92/12 http://rstl.royalsocietypublishing.org/content/92/12.full.pdf http://www.jstor.org/stable/107113}, volume = {92}, year = {1802} }  @article{Zhang2007, abstract = {Recently, methods based on local image features have shown promise for texture and object recognition tasks. This paper presents a large-scale evaluation of an approach that represents images as distributions (signatures or histograms) of features extracted from a sparse set of keypoint locations and learns a Support Vector Machine classifier with kernels based on two effective measures for comparing distributions, the Earth Mover’s Distance and the$\chi$2 distance. We first evaluate the performance of our approach with different keypoint detectors and descriptors, as well as different kernels and classifiers. We then conduct a comparative evaluation with several state-of-the-art recognition methods on four texture and five object databases. On most of these databases, our implementation exceeds the best reported results and achieves comparable performance on the rest. Finally, we investigate the influence of background correlations on recognition performance via extensive tests on the PASCAL database, for which ground-truth object localization information is available. Our experiments demonstrate that image representations based on distributions of local features are surprisingly effective for classification of texture and object images under challenging real-world conditions, including significant intra-class variations and substantial background clutter.}, author = {Zhang, J. and Marszalek, M.}, doi = {10.1007/s11263-006-9794-4}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/4QHV37R2/Zhang et al. 
- 2007 - Local Features and Kernels for Classification of T.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/S3WK47EE/s11263-006-9794-4.html:html}, issn = {0920-5691, 1573-1405}, journal = {International journal of \ldots}, keywords = {Artificial Intelligence (incl. Robotics),Computer Imaging- Vision- Pattern Recognition and,Image Processing and Computer Vision,Pattern Recognition,image classification,kernel methods,keypoints,mscthesis,object recognition,scale- and affine-invariant,support vector machines,texture recognition}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Computer Imaging- Vision- Pattern Recognition and,Image Processing and Computer Vision,Pattern Recognition,image classification,kernel methods,keypoints,mscthesis,object recognition,scale- and affine-invariant,support vector machines,texture recognition}, month = jun, number = {2}, pages = {213--238}, shorttitle = {Local Features and Kernels for Classification of T}, title = {{Local features and kernels for classification of texture and object categories: A comprehensive study}}, url = {http://link.springer.com/article/10.1007/s11263-006-9794-4 http://link.springer.com/content/pdf/10.1007/s11263-006-9794-4.pdf}, volume = {73}, year = {2007} }  @article{Boser1992, abstract = {A training algorithm that maximizes the margin between the training patterns and the decision boundary is presented. The technique is applicable to a wide variety of classifiaction functions, including Perceptrons, polynomials, and Radial Basis Functions. The effective number of parameters is adjusted automatically to match the complexity of the problem. The solution is expressed as a linear combination of supporting patterns. These are the subset of training patterns that are closest to the decision boundary. Bounds on the generalization performance based on the leave-one-out method and the VC-dimension are given. 
Experimental results on optical character recognition problems demonstrate the good generalization obtained when compared with other learning algorithms. 1 INTRODUCTION Good generalization performance of pattern classifiers is achieved when the capacity of the classification function is matched to the size of the training set. Classifiers with a large numb...}, author = {Boser, BE and Guyon, IM and Vapnik, VN}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/6JBZ3UQ4/summary.html:html;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/S5B5ADGH/Boser et al. - 1992 - A Training Algorithm for Optimal Margin Classifier.pdf:pdf;:share/imagedb/perellm1/references/Boser, Guyon, Vapnik\_1992\_A training algorithm for optimal margin classifiers.pdf:pdf}, journal = {\ldots of the fifth annual workshop on \ldots}, keywords = {mscthesis}, mendeley-tags = {mscthesis}, pages = {144--152}, publisher = {ACM Press}, title = {{A training algorithm for optimal margin classifiers}}, url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.21.3818\&rep=rep1\&type=pdf http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3818 http://dl.acm.org/citation.cfm?id=130401}, year = {1992} }  @article{Brown2007, abstract = {This paper concerns the problem of fully automated panoramic image stitching. Though the 1D problem (single axis of rotation) is well studied, 2D or multi-row stitching is more difficult. Previous approaches have used human input or restrictions on the image sequence in order to establish matching images. In this work, we formulate stitching as a multi-image matching problem, and use invariant local features to find matches between all of the images. Because of this our method is insensitive to the ordering, orientation, scale and illumination of the input images. It is also insensitive to noise images that are not part of a panorama, and can recognise multiple panoramas in an unordered image dataset. 
In addition to providing more detail, this paper extends our previous work in the area (Brown and Lowe, 2003) by introducing gain compensation and automatic straightening steps.}, author = {Brown, Matthew and Lowe, DG}, doi = {10.1007/s11263-006-0002-3}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/J4A9ZP5T/Brown and Lowe - 2007 - Automatic Panoramic Image Stitching using Invarian.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/VISCSHJJ/s11263-006-0002-3.html:html}, issn = {0920-5691, 1573-1405}, journal = {International journal of computer vision}, keywords = {Artificial Intelligence (incl. Robotics),Computer Imaging- Vision- Pattern Recognition and Graphics,Image Processing and Computer Vision,Pattern Recognition,multi-image matching,recognition,stitching}, language = {en}, mendeley-tags = {Artificial Intelligence (incl. Robotics),Computer Imaging- Vision- Pattern Recognition and Graphics,Image Processing and Computer Vision,Pattern Recognition,multi-image matching,recognition,stitching}, month = aug, number = {1}, pages = {59--73}, title = {{Automatic panoramic image stitching using invariant features}}, url = {http://link.springer.com/article/10.1007/s11263-006-0002-3 http://link.springer.com/content/pdf/10.1007/s11263-006-0002-3.pdf}, volume = {74}, year = {2007} }  @article{Dalal2005, abstract = {We study the question of feature sets for robust visual object recognition; adopting linear SVM based human detection as a test case. After reviewing existing edge and gradient based descriptors, we show experimentally that grids of histograms of oriented gradient (HOG) descriptors significantly outperform existing feature sets for human detection. 
We study the influence of each stage of the computation on performance, concluding that fine-scale gradients, fine orientation binning, relatively coarse spatial binning, and high-quality local contrast normalization in overlapping descriptor blocks are all important for good results. The new approach gives near-perfect separation on the original MIT pedestrian database, so we introduce a more challenging dataset containing over 1800 annotated human images with a large range of pose variations and backgrounds.}, author = {Dalal, N. and Triggs, B.}, doi = {10.1109/CVPR.2005.177}, file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/67RIXPKJ/Dalal and Triggs - 2005 - Histograms of oriented gradients for human detecti.pdf:pdf;:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/H8PHTUG7/abs\_all.html:html}, journal = {\ldots and Pattern Recognition, 2005. CVPR 2005 \ldots}, keywords = {High performance computing,Histograms,Humans,Image databases,Image edge detection,Object detection,Robustness,Testing,coarse spatial binning,contrast normalization,edge based descriptors,feature extraction,fine orientation binning,fine-scale gradients,gradient based descriptors,gradient methods,histograms of oriented gradients,human detection,linear SVM,mscthesis,object recognition,overlapping descriptor,pedestrian database,robust visual object recognition,support vector machines}, mendeley-tags = {High performance computing,Histograms,Humans,Image databases,Image edge detection,Object detection,Robustness,Testing,coarse spatial binning,contrast normalization,edge based descriptors,feature extraction,fine orientation binning,fine-scale gradients,gradient based descriptors,gradient methods,histograms of oriented gradients,human detection,linear SVM,mscthesis,object recognition,overlapping descriptor,pedestrian database,robust visual object recognition,support vector machines}, month = jun, pages = {886--893 vol. 
1}, title = {{Histograms of oriented gradients for human detection}}, url = {http://ieeexplore.ieee.org/ielx5/9901/31472/01467360.pdf?tp=\&arnumber=1467360\&isnumber=31472 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1467360}, volume = {1}, year = {2005} }  @article{Ji2013, abstract = {We consider the automated recognition of human actions in surveillance videos. Most current methods build classifiers based on complex handcrafted features computed from the raw inputs. Convolutional neural networks (CNNs) are a type of deep model that can act directly on the raw inputs. However, such models are currently limited to handling 2D inputs. In this paper, we develop a novel 3D CNN model for action recognition. This model extracts features from both the spatial and the temporal dimensions by performing 3D convolutions, thereby capturing the motion information encoded in multiple adjacent frames. The developed model generates multiple channels of information from the input frames, and the final feature representation combines information from all channels. To further boost the performance, we propose regularizing the outputs with high-level features and combining the predictions of a variety of different models. 
We apply the developed models to recognize human actions in the real-world environment of airport surveillance videos, and they achieve superior performance in comparison to baseline methods.}, annote = {$\backslash$begin\{itemize\}$\backslash$item Participated on TRECVID 2009$\backslash$item Videos with static camera$\backslash$item 3D - CNN$\backslash$begin\{itemize\}$\backslash$item input 7@60x40$\backslash$item Hardwired 33@60x40$\backslash$begin\{itemize\}$\backslash$item Gray$\backslash$item gradient-x$\backslash$item gradient-y$\backslash$item opticalflow-x$\backslash$item opticalflow-y$\backslash$end\{itemize\}$\backslash$item Convolution 7x7x3$\backslash$item C2 = 23*2@54x34$\backslash$item Subsampling 2x2$\backslash$item S3 = 23*2@27x17$\backslash$item Convolution 7x6x3$\backslash$item C4 = 13*6@21x12$\backslash$item Subsampling 3x3$\backslash$item S5 = 13*6@7x4$\backslash$item Convolution 7x4$\backslash$item C6 = 128@7x4$\backslash$item Fully connected layer$\backslash$end\{itemize\}$\backslash$item Datasets$\backslash$begin\{itemize\}$\backslash$item Surveillance Event Detection$\backslash$item Action classes$\backslash$begin\{itemize\}$\backslash$item CellToEar$\backslash$item ObjectPut$\backslash$item Pointing$\backslash$end\{itemize\}$\backslash$begin\{itemize\}$\backslash$item method$\backslash$begin\{itemize\}$\backslash$item Humman detector to locate human head$\backslash$item Create a bounding box with 7 time frames and 60x40 spatial pixels$\backslash$item$\backslash$end\{itemize\}$\backslash$item Best results on three tasks$\backslash$end\{itemize\}$\backslash$item KTH$\backslash$begin\{itemize\}$\backslash$item Comptetitive performance$\backslash$end\{itemize\}$\backslash$end\{itemize\}$\backslash\$end\{itemize\}},
author = {Ji, Shuiwang and Yang, Ming and Yu, Kai},
doi = {10.1109/TPAMI.2012.59},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Ji, Yang, Yu - 2013 - 3D convolutional neural networks for human action recognition.pdf:pdf},
issn = {1939-3539},
journal = {IEEE transactions on pattern analysis and machine intelligence},
keywords = {Algorithms,Automated,Automated: methods,Computer-Assisted,Computer-Assisted: methods,Decision Support Techniques,Image Interpretation,Imaging,Movement,Movement: physiology,Neural Networks (Computer),Pattern Recognition,Subtraction Technique,Three-Dimensional,Three-Dimensional: methods,mscthesis},
mendeley-tags = {mscthesis},
month = jan,
number = {1},
pages = {221--31},
pmid = {22392705},
title = {{3D convolutional neural networks for human action recognition.}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/22392705},
volume = {35},
year = {2013}
}

@inproceedings{Ke2004,
abstract = {Stable local feature detection and representation is a fundamental component of many image registration and object recognition algorithms. Mikolajczyk and Schmid (June 2003) recently evaluated a variety of approaches and identified the SIFT [D. G. Lowe, 1999] algorithm as being the most resistant to common image deformations. This paper examines (and improves upon) the local image descriptor used by SIFT. Like SIFT, our descriptors encode the salient aspects of the image gradient in the feature point's neighborhood; however, instead of using SIFT's smoothed weighted histograms, we apply principal components analysis (PCA) to the normalized gradient patch. Our experiments demonstrate that the PCA-based local descriptors are more distinctive, more robust to image deformations, and more compact than the standard SIFT representation. We also present results showing that using these descriptors in an image retrieval application results in increased accuracy and faster matching.},
author = {Ke, Yan and Sukthankar, Rahul},
booktitle = {Proceedings of the 2004 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR)},
doi = {10.1109/CVPR.2004.1315206},
keywords = {Computer science,Computer vision,Filters,Histograms,Image retrieval,Object detection,Robustness,feature extraction,image deformations,image gradient,image registration,image representation,image retrieval application,local feature detection,local image descriptor,mscthesis,object recognition,object recognition algorithms,principal component analysis,principal components analysis},
mendeley-tags = {Computer science,Computer vision,Filters,Histograms,Image retrieval,Object detection,Robustness,feature extraction,image deformations,image gradient,image registration,image representation,image retrieval application,local feature detection,local image descriptor,mscthesis,object recognition,object recognition algorithms,principal component analysis,principal components analysis},
month = jun,
pages = {II--506--II--513},
shorttitle = {PCA-SIFT},
title = {{PCA-SIFT: a more distinctive representation for local image descriptors}},
url = {http://ieeexplore.ieee.org/ielx5/9183/29134/01315206.pdf?tp=\&arnumber=1315206\&isnumber=29134 http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1315206\&tag=1},
volume = {2},
year = {2004}
}

@article{LeCun1989,
abstract = {An interesting property of connectionist systems is their ability to learn from examples. Although most recent work in the field concentrates on reducing learning times, the most important feature of a learning machine is its generalization performance. It is usually accepted that good generalization performance on real-world problems cannot be achieved unless some a priori knowledge about the task is built into the system. Back-propagation networks provide a way of specifying such knowledge by imposing constraints both on the architecture of the network and on its weights. In general, such constraints can be considered as particular transformations of the parameter space. Building a constrained network for image recognition appears to be a feasible task. We describe a small handwritten digit recognition problem and show that, even though the problem is linearly separable, single layer networks exhibit poor generalization performance. Multilayer constrained networks perform very well on this task when organized in a hierarchical structure with shift invariant feature detectors. These results confirm the idea that minimizing the number of free parameters in the network enhances generalization.},
author = {LeCun, Yann},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/LeCun - 1989 - Generalization and network design strategies.pdf:pdf},
journal = {Connectionism in Perspective},
publisher = {North-Holland},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
title = {{Generalization and network design strategies}},
url = {http://masters.donntu.edu.ua/2012/fknt/umiarov/library/lecun.pdf},
year = {1989}
}

@article{Mikolajczyk2005a,
abstract = {The paper gives a snapshot of the state of the art in affine covariant region detectors, and compares their performance on a set of test images under varying imaging conditions. Six types of detectors are included: detectors based on affine normalization around Harris (Mikolajczyk and Schmid, 2002; Schaffalitzky and Zisserman, 2002) and Hessian points (Mikolajczyk and Schmid, 2002), a detector of ‘maximally stable extremal regions', proposed by Matas et al. (2002); an edge-based region detector (Tuytelaars and Van Gool, 1999) and a detector based on intensity extrema (Tuytelaars and Van Gool, 2000), and a detector of ‘salient regions', proposed by Kadir, Zisserman and Brady (2004). The performance is measured against changes in viewpoint, scale, illumination, defocus and image compression. The objective of this paper is also to establish a reference test set of images and performance software, so that future detectors can be evaluated in the same framework.},
author = {Mikolajczyk, Krystian and Tuytelaars, Tinne and Schmid, Cordelia and Zisserman, Andrew and Matas, Jiri and Schaffalitzky, Frederik and Kadir, Timor and van Gool, Luc},
doi = {10.1007/s11263-005-3848-x},
issn = {0920-5691},
journal = {International Journal of Computer Vision},
keywords = {Artificial Intelligence (incl. Robotics),Computer Imaging- Graphics and Computer Vision,Image Processing,Local features,Pattern Recognition,affine region detectors,invariant image description,mscthesis,performance evaluation},
language = {en},
mendeley-tags = {Artificial Intelligence (incl. Robotics),Computer Imaging- Graphics and Computer Vision,Image Processing,Local features,Pattern Recognition,affine region detectors,invariant image description,mscthesis,performance evaluation},
month = nov,
number = {1--2},
pages = {43--72},
title = {{A comparison of affine region detectors}},
volume = {65},
year = {2005}
}

@article{Simonyan2013a,
abstract = {As massively parallel computations have become broadly available with modern GPUs, deep architectures trained on very large datasets have risen in popularity. Discriminatively trained convolutional neural networks, in particular, were recently shown to yield state-of-the-art performance in challenging image classification benchmarks such as ImageNet. However, elements of these architectures are similar to standard hand-crafted representations used in computer vision. In this paper, we explore the extent of this analogy, proposing a version of the state-of-the-art Fisher vector image encoding that can be stacked in multiple layers. This architecture significantly improves on standard Fisher vectors, and obtains competitive results with deep convolutional networks at a significantly smaller computational cost. Our hybrid architecture allows us to measure the performance improvement brought by a deeper image classification pipeline, while staying in the realms of conventional SIFT features and FV encodings.},
author = {Simonyan, Karen and Vedaldi, Andrea and Zisserman, Andrew},
editor = {Burges, C. J. C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K. Q.},
journal = {Advances in Neural Information Processing Systems},
volume = {26},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
pages = {163--171},
publisher = {Curran Associates, Inc.},
title = {{Deep Fisher networks for large-scale image classification}},
url = {http://papers.nips.cc/paper/4926-deep-fisher-networks-for-large-scale-image-classification.pdf http://papers.nips.cc/paper/4926-deep-fisher-networks-for-large-scale-image-classification},
year = {2013}
}

@article{Sommer,
author = {Sommer, Friedrich T. and Wennekers, Thomas},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Sommer, Wennekers, Real - 2003 - Models of distributed associative memory networks in the brain ∗.pdf:pdf},
keywords = {bidirectional associative memory,cell assemblies,cognitive modeling,memory systems,mscthesis,reciprocal connections},
mendeley-tags = {mscthesis},
internal-note = {Removed bogus third author "Real, El Camino" (a street address mis-imported by Mendeley). Journal field is missing; number={1949} looks suspect — verify against the published source.},
number = {1949},
pages = {55--69},
title = {{Models of distributed associative memory networks in the brain}},
volume = {122},
year = {2003}
}

@inproceedings{Taigman,
author = {Taigman, Yaniv and Yang, Ming and Ranzato, Marc'Aurelio and Wolf, Lior},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
file = {:share/imagedb/perellm1/references/Taigman et al.\_2014\_DeepFace Closing the Gap to Human-Level Performance in Face Verification.pdf:pdf},
keywords = {mscthesis},
mendeley-tags = {mscthesis},
title = {{DeepFace: Closing the Gap to Human-Level Performance in Face Verification}},
url = {http://www.cs.tau.ac.il/\~{}wolf/papers/deepface\_11\_01\_2013.pdf},
year = {2014}
}

@article{Zeiler2013,
abstract = {Large Convolutional Network models have recently demonstrated impressive classification performance on the ImageNet benchmark. However there is no clear understanding of why they perform so well, or how they might be improved. In this paper we address both issues. We introduce a novel visualization technique that gives insight into the function of intermediate feature layers and the operation of the classifier. We also perform an ablation study to discover the performance contribution from different model layers. This enables us to find model architectures that outperform Krizhevsky et.al. on the ImageNet classification benchmark. We show our ImageNet model generalizes well to other datasets: when the softmax classifier is retrained, it convincingly beats the current state-of-the-art results on Caltech-101 and Caltech-256 datasets.},
archiveprefix = {arXiv},
arxivid = {arXiv:1311.2901v3},
author = {Zeiler, Matthew D. and Fergus, Rob},
eprint = {arXiv:1311.2901v3},
file = {:home/perellm1/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Zeiler, Fergus - 2013 - Visualizing and Understanding Convolutional Networks.pdf:pdf},
journal = {arXiv preprint arXiv:1311.2901},
keywords = {CNN,mscthesis},
mendeley-tags = {CNN,mscthesis},
title = {{Visualizing and Understanding Convolutional Networks}},
url = {http://arxiv.org/abs/1311.2901 http://arxiv.org/pdf/1311.2901.pdf},
year = {2013}
}

This file was generated by bibtex2html 1.97.