<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="http://feeds.semantikoz.com/~d/styles/rss2full.xsl" type="text/xsl" media="screen"?><?xml-stylesheet href="http://feeds.semantikoz.com/~d/styles/itemcontent.css" type="text/css" media="screen"?><rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:wfw="http://wellformedweb.org/CommentAPI/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">

<channel>
	<title>semantikoz</title>
	
	<link>http://www.semantikoz.com</link>
	<description>Semantic Vector Space Research and more ...</description>
	<pubDate>Wed, 03 Dec 2008 21:47:49 +0000</pubDate>
	<generator>http://wordpress.org/?v=2.6.5</generator>
	<language>en</language>
			<atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" href="http://feeds.semantikoz.com/Semantikoz" type="application/rss+xml" /><feedburner:feedFlare xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" href="http://add.my.yahoo.com/rss?url=http%3A%2F%2Ffeeds.semantikoz.com%2FSemantikoz" src="http://us.i1.yimg.com/us.yimg.com/i/us/my/addtomyyahoo4.gif">Subscribe with My Yahoo!</feedburner:feedFlare><feedburner:feedFlare xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" href="http://www.newsgator.com/ngs/subscriber/subext.aspx?url=http%3A%2F%2Ffeeds.semantikoz.com%2FSemantikoz" src="http://www.newsgator.com/images/ngsub1.gif">Subscribe with NewsGator</feedburner:feedFlare><feedburner:feedFlare xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" href="http://feeds.my.aol.com/add.jsp?url=http%3A%2F%2Ffeeds.semantikoz.com%2FSemantikoz" src="http://o.aolcdn.com/favorites.my.aol.com/webmaster/ffclient/webroot/locale/en-US/images/myAOLButtonSmall.gif">Subscribe with My AOL</feedburner:feedFlare><feedburner:feedFlare xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" href="http://www.rojo.com/add-subscription?resource=http%3A%2F%2Ffeeds.semantikoz.com%2FSemantikoz" src="http://blog.rojo.com/RojoWideRed.gif">Subscribe with Rojo</feedburner:feedFlare><feedburner:feedFlare xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" href="http://www.bloglines.com/sub/http://feeds.semantikoz.com/Semantikoz" src="http://www.bloglines.com/images/sub_modern11.gif">Subscribe with Bloglines</feedburner:feedFlare><feedburner:feedFlare xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" href="http://www.netvibes.com/subscribe.php?url=http%3A%2F%2Ffeeds.semantikoz.com%2FSemantikoz" src="http://www.netvibes.com/img/add2netvibes.gif">Subscribe with Netvibes</feedburner:feedFlare><feedburner:feedFlare xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" 
href="http://fusion.google.com/add?feedurl=http%3A%2F%2Ffeeds.semantikoz.com%2FSemantikoz" src="http://buttons.googlesyndication.com/fusion/add.gif">Subscribe with Google</feedburner:feedFlare><feedburner:feedFlare xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" href="http://www.pageflakes.com/subscribe.aspx?url=http%3A%2F%2Ffeeds.semantikoz.com%2FSemantikoz" src="http://www.pageflakes.com/ImageFile.ashx?instanceId=Static_4&amp;fileName=ATP_blu_91x17.gif">Subscribe with Pageflakes</feedburner:feedFlare><item>
		<title>Free Stop Word Lists in 23 Languages</title>
		<link>http://www.semantikoz.com/2008/04/02/free-stop-word-lists-in-23-languages/</link>
		<comments>http://www.semantikoz.com/2008/04/02/free-stop-word-lists-in-23-languages/#comments</comments>
		<pubDate>Wed, 02 Apr 2008 06:16:09 +0000</pubDate>
		<dc:creator>CWP</dc:creator>
		
		<category><![CDATA[semantics]]></category>

		<category><![CDATA[nlp]]></category>

		<category><![CDATA[vector space models]]></category>

		<guid isPermaLink="false">http://www.semantikoz.com/?p=22</guid>
		<description><![CDATA[Stop words or stopwords are used in Natural Language Processing (NLP) to eliminate words that bear no content or relevant semantics. Search engines use stop words to improve the search queries. Google&#8217;s FAQ gives a short explanation here. A stop word list consists mostly of some basic combination of letters and numbers as well as [...]]]></description>
			<content:encoded><![CDATA[<p><img class="alignleft size-full wp-image-23" title="Stop" src="http://www.semantikoz.com/wp-content/uploads/2008/04/old-stop-sign.jpg" alt="Stop" width="128" height="128" />Stop words or stopwords are used in Natural Language Processing (NLP) to eliminate words that bear no content or relevant semantics. Search engines use stop words to improve the search queries. Google&#8217;s FAQ gives a short explanation <a href="http://www.google.com/support/bin/answer.py?answer=981&amp;topic=13912" target="_blank">here</a>. A stop word list consists mostly of some basic combination of letters and numbers as well as pronouns, adverbs, prepositions, some verbs, adjectives and conjunctions.</p>
<p>For example the sentence <em>&#8220;The government did not introduce the tax bill&#8221;</em> could be represented by <em>&#8220;S government S S introduce S tax bill&#8221;</em> with <em>&#8216;S&#8217;</em> standing for a stop word. As a result the amount of data that has to be processed is reduced with a simple matching and removing/replacing of stop words with no or minimal impact on the information contained. There are several lists freely available.<span id="more-22"></span></p>
<p>Find Catalan, Czech, Danish, Dutch, French, English, German, Hungarian, Italian, Norwegian, Polish, Portuguese, Spanish and Turkish stop word lists at <a title="Stop Word Lists" href="http://www.ranks.nl/stopwords/" target="_blank" class="broken_link">Ranks.nl</a>.</p>
<p>Arabic, Bulgarian, Czech, French, English, Finnish, German, Hungarian, Italian, Romanian, Russian, Spanish, Swedish, Polish and Portuguese stop word lists are available from <a title="Stop Word Lists" href="http://members.unine.ch/jacques.savoy/clef/" target="_blank">Jacques Savoy&#8217;s page</a>.</p>
<p><a title="Snowball Homepage" rel="nofollow" href="http://snowball.tartarus.org/">The Snowball project</a> offers English, French, Spanish, German, Portuguese, Italian, Dutch, Swedish, Norwegian, Danish, Russian, Finnish and Hungarian stop lists. As it is part of a stemmer project the lists are not in one place and have to be downloaded from each language page.</p>
<p><em><br />
</em></p>
<img src="http://feeds.semantikoz.com/~r/Semantikoz/~4/262453957" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.semantikoz.com/2008/04/02/free-stop-word-lists-in-23-languages/feed/</wfw:commentRss>
		</item>
		<item>
		<title>Quick, easy, professional - Program for free under Microsoft Windows</title>
		<link>http://www.semantikoz.com/2008/03/05/programming-on-a-shoestring/</link>
		<comments>http://www.semantikoz.com/2008/03/05/programming-on-a-shoestring/#comments</comments>
		<pubDate>Tue, 04 Mar 2008 20:14:24 +0000</pubDate>
		<dc:creator>CWP</dc:creator>
		
		<category><![CDATA[programming]]></category>

		<category><![CDATA[c#]]></category>

		<category><![CDATA[ide]]></category>

		<category><![CDATA[profiling]]></category>

		<category><![CDATA[unit testing]]></category>

		<guid isPermaLink="false">http://www.semantikoz.com/2008/03/05/program-on-a-shoestring/</guid>
		<description><![CDATA[If you are a student, financially constrained or just interested in programming a little, there is a way to program for free and well at the same time! First of all let&#8217;s assume you are a common guy with common needs so you likely will be using Microsoft Windows, looking to program for Windows possibly [...]]]></description>
			<content:encoded><![CDATA[<p><img class="alignright" src="http://www.semantikoz.com/wp-content/uploads/2008/03/wooden-piggy-bank128.jpg" alt="Wooden Piggy Bank" />If you are a student, financially constrained or just interested in programming a little, there is a way to program for free and well at the same time! First of all let&#8217;s assume you are a common guy with common needs so you likely will be using Microsoft Windows, looking to program for Windows possibly using C#. If you prefer Linux or other systems and look to program in something more &#8216;outlandish&#8217;, e.g. your own assembler language then surely there is help out <a title="There are never enough assemblers!" href="http://www.google.com/search?q=%22my+assembler%22" target="_blank">there </a>but I am not addressing it here (yet).<span id="more-16"></span></p>
<p><a title="Visual Studio Express 2008 C# Edition" rel="attachment wp-att-17" href="http://www.semantikoz.com/2008/03/05/programming-on-a-shoestring/visual-studio-express-2008-c-edition/"><img class="alignleft" src="http://www.semantikoz.com/wp-content/uploads/2008/03/vs2008.thumbnail.jpg" alt="Visual Studio Express 2008 C# Edition" /></a>For starters you would need an <a title="Integrated Development Environment (IDE) at Wikipedia" href="http://en.wikipedia.org/wiki/Integrated_development_environment" target="_blank">Integrated Development Environment (IDE)</a>. It allows you to write code, save, organise, build, compile and debug it. I will not bore you too much because very likely you have heard already about IDEs. One that is hard to beat especially if you are into <a href="http://en.wikipedia.org/wiki/C_Sharp_%28programming_language%29" target="_blank">C#</a>, <a href="http://en.wikipedia.org/wiki/C%2B%2B" target="_blank">C++</a>, <a href="http://en.wikipedia.org/wiki/ASP.NET" target="_blank">ASP.NET</a> or <a href="http://en.wikipedia.org/wiki/Visual_basic" target="_blank">VB</a> is Microsoft&#8217;s Visual Studio. The good news is that there is a free version, <a title="Download a free Visual Studio Express IDE from Microsoft" href="http://www.microsoft.com/express/" target="_blank">Visual Studio 2008 Express</a>, out there.  The bad news is that it is stripped of many features. Some of them quite useful to most of us. Even if you &#8216;only&#8217; develop your student project, thesis or start up self-employed work. Luckily, there are nice people out there who share their solutions to their/our problems.</p>
<p><a title="TortoiseSVN right click context menu" rel="attachment wp-att-18" href="http://www.semantikoz.com/2008/03/05/programming-on-a-shoestring/tortoisesvn-right-click-context-menu/"><img class="alignright" src="http://www.semantikoz.com/wp-content/uploads/2008/03/tortoisesvn.thumbnail.jpg" alt="TortoiseSVN right click context menu" /></a>Among the most useful tools in software development are <a title="Version Control Systems at Wikipedia" href="http://en.wikipedia.org/wiki/Version_control_system" target="_blank">version or revision control systems</a>. They keep track of changes in files and if necessary help you organise several people working on several versions of the same files. Even if you are working alone the benefits are tremendous. Assume you have two computers one at home/desktop and one at university/work/laptop. With internet access, you can install a version control system and synchronise your work from wherever you are. If you suddenly share your work with a friend then (s)he can contribute under a new user name and you can exchange your work and stay updated. Most importantly you have a central repository of your work which works automatically as a double backup. Firstly it will be on a different machine and secondly it will allow you to revert to any version of any file which is the ultimate &#8216;Undo&#8217; function.</p>
<ul>
<li>So what do you use? Well there are several out there but I am very happy with using <a title="Subversion Homepage" href="http://subversion.tigris.org/" target="_blank">Subversion</a>.</li>
<li>Looks complicated? Some web site providers throw in free subversion   (or cvs) repositories which can be a good way to start.</li>
<li>How do you use it on your machine without getting into the dark world of command line tools? On your Windows machine use <a title="TortoiseSVN Homepage" href="http://tortoisesvn.tigris.org/" target="_blank">TortoiseSVN </a>to connect with your repository through simple right click context menu on any folder/file.</li>
</ul>
<p>You could go ahead and start your work now but I would recommend at least one more tool. Independent of your programming style a unit test is always a good idea. You might be the <a title="TDD at Wikipedia" href="http://en.wikipedia.org/wiki/Test-driven_development" target="_blank">Test-Driven Development (TDD)</a> or the write first test later type. Even if you are the latter you should and will need to do a unit test if you want to keep your code stable and free of as many bugs as possible. Use an <a href="http://en.wikipedia.org/wiki/Test_automation" target="_blank">automated unit test</a> tool like <a title="NUnit Homepage" href="http://www.nunit.org/index.php" target="_blank">NUnit</a> (if you are using C#) or <a title="csUnit Homepage" href="http://www.csunit.org/" target="_blank">csUnit</a> (for any .NET language) to do the hard work for you once you have written your tests. One downside is that the IDE integration does not work because Visual Studio Express does not allow plugins to my knowledge. All that means is that you have to start the unit test program yourself but what is a click every now and then when it is for free!</p>
<p><a title="IJW Profiler" href="http://www.semantikoz.com/wp-content/uploads/2008/03/ijw-profiler.jpg"><img class="alignleft" src="http://www.semantikoz.com/wp-content/uploads/2008/03/ijw-profiler.thumbnail.jpg" alt="IJW Profiler" /></a>My last recommendation, a profiler, is not a necessity but can be very useful for some and teaches you a lot at the same time.  If you have a performance sensitive application a profiler will be essential to work out where you waste time and where it is worth to optimise. A very good and  (currently until released in version 1.0) free one is <a title="IJW Profiler Homepage" href="http://www.ijw.co.nz/profiler.htm" target="_blank">IJW Profiler</a> which can deal with Java and C# applications. I hope they keep it free for private and academic use. Just run your application from the profiler and it will keep a track of how long what in your application took allowing you to track down the parts that take the most time and optimise them.</p>
<img src="http://feeds.semantikoz.com/~r/Semantikoz/~4/246037739" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.semantikoz.com/2008/03/05/programming-on-a-shoestring/feed/</wfw:commentRss>
		</item>
		<item>
		<title>Voronoi/Voronoy Tessellation</title>
		<link>http://www.semantikoz.com/2008/02/28/voronoivoronoy-tessellation/</link>
		<comments>http://www.semantikoz.com/2008/02/28/voronoivoronoy-tessellation/#comments</comments>
		<pubDate>Thu, 28 Feb 2008 03:09:43 +0000</pubDate>
		<dc:creator>CWP</dc:creator>
		
		<category><![CDATA[misc]]></category>

		<category><![CDATA[clustering]]></category>

		<category><![CDATA[geometry]]></category>

		<category><![CDATA[vector space]]></category>

		<category><![CDATA[voronoy]]></category>

		<guid isPermaLink="false">http://www.semantikoz.com/archives/11</guid>
		<description><![CDATA[
The Voronoy (or Voronoi) Tessellation (Voronoy 1908) is a technique that enables the division of multi-dimensional spaces into subspaces. Its application defines geometric areas equivalent to subspaces by defining several vectors as centres of subspaces. Any other vector in space can then be attributed to the closest centre vector effectively dividing the whole [...]]]></description>
			<content:encoded><![CDATA[<p><img class="alignleft" src="http://www.semantikoz.com/wp-content/uploads/2008/02/coloured_voronoi_2d.thumbnail.png" alt="The Voronoy Tessellation of a random set of points in the plane (all points lie within the image). [Source: http://en.wikipedia.org/wiki/Image:Coloured_Voronoi_2D.png, GNU Free Documentation license]" /></p>
<p>The <a title="Voronoi Diagrams at Wikipedia" href="http://en.wikipedia.org/wiki/Voronoi_diagram">Voronoy (or Voronoi) Tessellation</a> (<a href="http://www.semantikoz.com/biblio#voronoy1908">Voronoy 1908</a>) is a technique that enables the division of multi-dimensional spaces into subspaces. Its application defines geometric areas equivalent to subspaces by defining several vectors as centres of subspaces. Any other vector in space can then be attributed to the closest centre vector effectively dividing the whole space into subspaces. It is thus an excellent choice for dividing semantic vector spaces.</p>
<p><span id="more-11"></span></p>
<blockquote><p>Voronoi diagrams and Delaunay tessellations are one of a few truly interdisciplinary concepts with relevant material to be found in, but not limited to, anthropology, archaeology, astronomy, biology, cartography, chemistry, computational geometry, crystallography, ecology, forestry, geography, geology, linguistics, marketing, metallography, meteorology, operations research, physics, physiology, remote sensing, statistics, and urban and regional planning. (<a title="Okabe Atsuyuki, Boots Barry, Sugihara Kokichi &amp; Chiu Sung Nok, 2000, Concepts and Applications of Voronoi Diagrams, 2nd Edition, John Wiley" href="http://www.semantikoz.com/biblio#okabe2000" target="_blank">Okabe, Boots, Sugihara and Chiu, 2000</a>)</p></blockquote>
<p><a title="2D Voronoy Tessellation" rel="attachment wp-att-13" href="http://www.semantikoz.com/2008/02/28/voronoivoronoy-tessellation/2d-voronoy-tessellation/"><img src="http://www.semantikoz.com/wp-content/uploads/2008/02/coloured_voronoi_2d.png" alt="2D Voronoy Tessellation" /></a></p>
<p align="center">2D <a href="/2008/02/28/voronoivoronoy-tessellation/" >Voronoy Tessellation</a></p>
<p>Aurenhammer (<a title="Aurenhammer Franz, 1991, Voronoi diagrams - survey of a fundamental geometric data structure, ACM Computing Surveys, vol. 23, no. 3, pp. 345 - 405" href="http://www.semantikoz.com/biblio#aurenhammer1991" target="_blank">1991</a>) describes <a href="/2008/02/28/voronoivoronoy-tessellation/" >Voronoy Tessellation</a> as “one of the most fundamental data structures in computational geometry” which are used in modelling natural phenomena, to investigate “their mathematical, in particular, geometrical, combinatorial, and stochastic properties” and their computational representation. It also offers various methods for clustering of multi-dimensional data.</p>
<p align="center"><a title="Georgy Voronoy at Wikipedia" href="http://en.wikipedia.org/wiki/Georgy_Voronoy" target="_blank">Georgy Feodosevich Voronoy/<span lang="ru" xml:lang="ru">Вороной Георгий Феодосьевич</span></a></p>
<img src="http://feeds.semantikoz.com/~r/Semantikoz/~4/246037740" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.semantikoz.com/2008/02/28/voronoivoronoy-tessellation/feed/</wfw:commentRss>
		</item>
		<item>
		<title>Information Mapping Project (INFOMAP)</title>
		<link>http://www.semantikoz.com/2008/02/26/information-mapping-project-infomap/</link>
		<comments>http://www.semantikoz.com/2008/02/26/information-mapping-project-infomap/#comments</comments>
		<pubDate>Tue, 26 Feb 2008 05:01:10 +0000</pubDate>
		<dc:creator>CWP</dc:creator>
		
		<category><![CDATA[semantics]]></category>

		<category><![CDATA[hal]]></category>

		<category><![CDATA[infomap]]></category>

		<category><![CDATA[lsa]]></category>

		<category><![CDATA[matrix]]></category>

		<category><![CDATA[semantic space]]></category>

		<category><![CDATA[semantic vector space]]></category>

		<category><![CDATA[svd]]></category>

		<category><![CDATA[vector space]]></category>

		<category><![CDATA[vector space model]]></category>

		<guid isPermaLink="false">http://www.semantikoz.com/archives/8</guid>
		<description><![CDATA[The INFOMAP project is an older but nevertheless interesting introduction into semantic vector space models. The related software is freely available. It uses a combination of approaches but mostly relies on Schütze&#8217;s Automatic word sense discrimination work. However, it does not use context vectors and concentrates on a SVD compressed HAL matrix.
In my research I [...]]]></description>
			<content:encoded><![CDATA[<p>The <a title="Information Mapping Project Homepage" href="http://www-csli.stanford.edu/semlab-hold/infomap.html" target="_blank">INFOMAP project</a> is an older but nevertheless interesting introduction into semantic vector space models. The related <a title="INFOMAP Software" href="http://infomap-nlp.sourceforge.net/" target="_blank">software</a> is freely available. It uses a combination of approaches but mostly relies on <a href="http://www.semantikoz.com/2008/02/26/automatic-word-sense-discrimination/" target="_blank">Schütze</a><a title="Schütze's Automatic word sense discrimination paper" href="http://www.semantikoz.com/2008/02/26/automatic-word-sense-discrimination/" target="_blank">&#8217;s Automatic word sense discrimination</a> work. However, it does not use context vectors and concentrates on a <a href="/2008/02/26/the-mystery-of-singular-value-decomposition/" >SVD</a> compressed <a href="/2008/02/25/hyperspace-analogue-to-language-hal-introduction/" >HAL</a> matrix.<span id="more-8"></span></p>
<p>In my research I worked with it intensively and found it to be useful with large corpora. It parses a corpus of one or several documents and generates the word vectors in a <a href="/2008/02/25/hyperspace-analogue-to-language-hal-introduction/" >HAL</a> matrix excluding words contained in a stop list. A stop list is a collection of words and letters that have ambiguous features and are semantically expressionless, for example <em>I</em>, <em>you</em>, <em>are</em>, <em>a</em>, <em>b</em>, <em>c</em>, <em>d</em> and so on. This allows the use of a simple parser and despite the lack of stemming reduces the number of words significantly. To limit the number of columns and rows (which can be set by a parameter) frequency is used. So only the <em>x</em> most frequent words are used for columns and <em>y</em> most frequent ones for the rows. The columns have an additional gap feature where by default the 50 most frequent words are ignored. According to <a href="/2008/02/26/information-mapping-project-infomap/" >INFOMAP</a> these words often are ambiguous because of their high frequency. The <a href="/2008/02/25/hyperspace-analogue-to-language-hal-introduction/" >HAL</a> matrix is not a simple count of occurrences of words but uses a <a title="TF-IDF at Wikipedia" href="http://en.wikipedia.org/wiki/TFIDF" target="_blank">Term Frequency - Inverse Document Frequency</a> (TF-IDF) measure to weight a word and uses this value when parsing the text and incrementing the matrix.</p>
<p>Once the corpus is parsed and the <a href="/2008/02/25/hyperspace-analogue-to-language-hal-introduction/" >HAL</a> matrix computed a <a href="/2008/02/26/the-mystery-of-singular-value-decomposition/" >SVD</a> based on a Lanczos algorithm is performed on it. The resulting left matrix U is then truncated to the columns pre-set in the parameters (default 100) or less if the Lanczos algorithm converges earlier.</p>
<p>Documents are mapped into the space once the <a href="/2008/02/26/the-mystery-of-singular-value-decomposition/" >SVD</a> and dimensional reduction is completed. Each document vector is a summation of the word( vector)s contained in a document. The query engine of the software allows to query for terms or combination of terms and finds the closest and documents. A nice extra feature is the implementation of a logical NOT to the query engine. So one could query <em>suite</em> NOT <em>clothes</em> to remove possible clothing meaning from the query and focus on alternative meanings like a lawsuit. This is done by creating the query vector of the first part of the query and making it orthogonal to the second part, the NOT, of the query. As a result the final query vector will be orthogonal (unrelated) to the NOT part but retain all other information of the positive part of the query. This simple but brilliant approach was developed by Dominic Widdows and published in <a title="Dominic Widdow's paper to negation in vector spaces" href="http://www.puttypeg.com/papers/negation-ir.pdf" target="_blank">Orthogonal Negation in Vector Spaces for Modelling Word-Meanings and   Document Retrieval</a>.</p>
<p>While the results of <a href="/2008/02/26/information-mapping-project-infomap/" >INFOMAP</a> are good and intuitively right, my research has revealed some shortcomings and subsequently led me to develop my own implementation with several improvements. I will not discuss the details of the problems as it is part of my thesis and maybe part of publications to come. Nevertheless, it is a great starting point for anyone interested in playing around with semantic vector spaces. I certainly gained a great insight by using it.</p>
<img src="http://feeds.semantikoz.com/~r/Semantikoz/~4/246037741" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.semantikoz.com/2008/02/26/information-mapping-project-infomap/feed/</wfw:commentRss>
		</item>
		<item>
		<title>Automatic word sense discrimination</title>
		<link>http://www.semantikoz.com/2008/02/26/automatic-word-sense-discrimination/</link>
		<comments>http://www.semantikoz.com/2008/02/26/automatic-word-sense-discrimination/#comments</comments>
		<pubDate>Tue, 26 Feb 2008 03:18:57 +0000</pubDate>
		<dc:creator>CWP</dc:creator>
		
		<category><![CDATA[semantics]]></category>

		<category><![CDATA[hal]]></category>

		<category><![CDATA[matrix]]></category>

		<category><![CDATA[semantic space]]></category>

		<category><![CDATA[semantic vector space]]></category>

		<category><![CDATA[svd]]></category>

		<category><![CDATA[vector space]]></category>

		<category><![CDATA[vector space model]]></category>

		<guid isPermaLink="false">http://www.semantikoz.com/archives/9</guid>
		<description><![CDATA[Automatic word sense discrimination was published in 1998 by Hinrich Schütze and can be seen as a further development of the HAL approach. He calls the underlying semantic vector space, Word Space, but it relates to the same basic matrix of word co-occurrences in a word by word matrix. His aim is to identify Senses [...]]]></description>
			<content:encoded><![CDATA[<p><a href="http://portal.acm.org/citation.cfm?id=972724&amp;dl=" target="_blank">Automatic word sense discrimination</a> was published in 1998 by <a href="http://www.ims.uni-stuttgart.de/~schuetze/" target="_blank">Hinrich Schütze</a> and can be seen as a further development of the <a href="/2008/02/25/hyperspace-analogue-to-language-hal-introduction/" >HAL</a> approach. He calls the underlying semantic vector space, <em>Word Space</em>, but it relates to the same basic matrix of word co-occurrences in a word by word matrix. His aim is to identify <em>Senses </em>in the vector spaces which one could imagine to be categories or topics. Furthermore, his approach attempts to attribute occurrences of ambiguous words to <em>Senses</em>.<span id="more-9"></span></p>
<p>Schütze introduces <em>Context Vectors</em> which are second order co-occurrences while <em>Word Vectors</em> are first order. Word vectors are created in the form of a <a href="/2008/02/25/hyperspace-analogue-to-language-hal-introduction/">HAL</a> matrix. Context vectors are a summation of word vectors close to a single occurrence of the word under investigation. As a result word vectors are a general representation of a word while a context vector is a representation of the context of a single occurrence of a word. The latter are more focused and also only valid for the word in the particular context.</p>
<p>Context vectors are then clustered to identify areas of meaning, each of which is a collection of close context vectors. The centre of such a cluster according to Schütze is a <em>Sense</em>. An example he makes illustrates this. The context vectors of <em>suite </em>might be attributed to different senses. If <em>suite </em>has a legal context and appears with words like <em>judge </em>and <em>law </em>it would be attributed to a sense vector (topic) representing legal meanings. Another time the word might be encountered surrounded by words like <em>tailor </em>and <em>shirt </em>resulting in an attribution of the context vector to a clothing sense.</p>
<p>To reduce the dimensionality of his matrix and take advantage of its positive characteristics Schütze employs Singular Value Decomposition (<a href="/2008/02/26/the-mystery-of-singular-value-decomposition/" >SVD</a>). He assumes it helps to uncover latent meaning. I would tend to attribute <a href="/2008/02/26/the-mystery-of-singular-value-decomposition/">SVD&#8217;s positive influence</a> to a combination of amplification and noise filtering of the matrix. If I understood his paper <a href="/2008/02/26/the-mystery-of-singular-value-decomposition/" >SVD</a> is only employed on the initial word matrix and not the context matrix. This would make sense as latter should have been much less sparse than former.</p>
<p>To test his work Schütze uses pseudo-words. To construct them he picks two word( vector)s which have very little in common, e.g. <em>door </em>and <em>banana</em>, and conflates them into one pseudo-word. Once he parses his text and clusters it, it allows him to easily identify if a context of the pseudo-word was related to <em>door </em>or <em>banana </em>and as such attributed to the right sense. His results show that these artificial ambiguous words are identified to a higher degree than the natural ambiguous words like <em>suite</em>. Furthermore, abstract senses that are encountered in words like <em>space </em>or pairs like <em>wide range</em> are harder to attribute. This is likely due to their appearance in more contexts than other words and as a result higher ambiguity.</p>
<img src="http://feeds.semantikoz.com/~r/Semantikoz/~4/246037742" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.semantikoz.com/2008/02/26/automatic-word-sense-discrimination/feed/</wfw:commentRss>
		</item>
		<item>
		<title>The Mystery of Singular Value Decomposition</title>
		<link>http://www.semantikoz.com/2008/02/26/the-mystery-of-singular-value-decomposition/</link>
		<comments>http://www.semantikoz.com/2008/02/26/the-mystery-of-singular-value-decomposition/#comments</comments>
		<pubDate>Tue, 26 Feb 2008 00:07:58 +0000</pubDate>
		<dc:creator>CWP</dc:creator>
		
		<category><![CDATA[semantics]]></category>

		<category><![CDATA[hal]]></category>

		<category><![CDATA[lsa]]></category>

		<category><![CDATA[matrix]]></category>

		<category><![CDATA[semantic space]]></category>

		<category><![CDATA[semantic vector space]]></category>

		<category><![CDATA[svd]]></category>

		<category><![CDATA[vector space]]></category>

		<category><![CDATA[vector space model]]></category>

		<guid isPermaLink="false">http://www.semantikoz.com/archives/10</guid>
		<description><![CDATA[Apperceptual comments on an interesting problem in one of his blog posts. He is discussing the importance of high order co-occurrences on word similarity measures in LSA. The part that interested me was the discussion of Singular Value Decomposition (SVD). My gut feeling has always been that SVD&#8217;s most useful characteristic was to amplify the [...]]]></description>
			<content:encoded><![CDATA[<p><img class="alignleft" src="http://www.semantikoz.com/wp-content/uploads/2008/03/mysterybw128.jpg" alt="Mystery" /><a title="Peter Turney's Blog" href="http://apperceptual.wordpress.com/" target="_blank">Apperceptual</a> comments on an interesting problem in <a title="Why Does SVD Improve Similarity Measurement?" href="http://apperceptual.wordpress.com/2007/01/24/why-does-svd-improve-similarity-measurement/" target="_blank">one of his blog posts</a>. He is discussing the importance of high order co-occurrences on word similarity measures in LSA. The part that interested me was the discussion of Singular Value Decomposition (<a href="http://en.wikipedia.org/wiki/Singular_value_decomposition">SVD</a>). My gut feeling has always been that SVD&#8217;s most useful characteristic was to amplify the information content and reduce noise. Certainly an interesting question that comes to mind is how to measure such an improvement. A dimensional reduction (or for that matter any noise reduction) is only useful when applied appropriately or it falls short of its ability or worse reduces the (useful) information content. To test this run a semantic vector space with increasingly harsh dimensional reduction on the vector space. The vectors start focusing, then clumping until the reduction is too high and they collapse on a handful of dimensions.<span id="more-10"></span></p>
<p>The two other points he makes are latent meaning being embedded in the columns of the matrix as well as high order co-occurrences. The latter appears to be disputed by <a title="Landauer at University of Colorado at Boulder " href="http://dirwww.colorado.edu/whitepages/ldapdrill.xml?cnfull=100034363" target="_blank">Landauer</a> and as explained by Apperceptual appears to have little influence. I am not certain how much difference there is between latent meaning and high order co-occurrences. It might very well be that these two are closely linked if not the same. If one thinks about Hinrich Schütze&#8217;s <a href="http://www.semantikoz.com/2008/02/26/automatic-word-sense-discrimination/">Automatic word sense discrimination</a> it seems to make a similar point with context and second order co-occurrences. Assuming the context of a word is similar to the mentioned columns and the contained latent meanings then one could argue that they are nothing more than high order co-occurrences. To be honest it is not a completely accurate comparison as Schütze bases his work on a <a href="http://www.semantikoz.com/2008/02/25/hyperspace-analogue-to-language-hal-introduction/">HAL</a> matrix and not LSA.</p>
<img src="http://feeds.semantikoz.com/~r/Semantikoz/~4/246037744" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.semantikoz.com/2008/02/26/the-mystery-of-singular-value-decomposition/feed/</wfw:commentRss>
		</item>
		<item>
		<title>Hyperspace Analogue to Language (HAL) Introduction</title>
		<link>http://www.semantikoz.com/2008/02/25/hyperspace-analogue-to-language-hal-introduction/</link>
		<comments>http://www.semantikoz.com/2008/02/25/hyperspace-analogue-to-language-hal-introduction/#comments</comments>
		<pubDate>Mon, 25 Feb 2008 04:52:25 +0000</pubDate>
		<dc:creator>CWP</dc:creator>
		
		<category><![CDATA[semantics]]></category>

		<category><![CDATA[hal]]></category>

		<category><![CDATA[matrix]]></category>

		<category><![CDATA[semantic space]]></category>

		<category><![CDATA[semantic vector space]]></category>

		<category><![CDATA[vector space]]></category>

		<category><![CDATA[vector space model]]></category>

		<guid isPermaLink="false">http://www.semantikoz.com/archives/4</guid>
		<description><![CDATA[Also known as semantic memory it was developed by Kevin Lund and Curt Burgess from the University of California, Riverside, California. You can download the corresponding paper, Producing high-dimensional semantic spaces from lexical co-occurrence, in PDF format.
The basic premise the work relies on is that words with similar meaning repeatedly occur closely (also known as [...]]]></description>
			<content:encoded><![CDATA[<p><img src="http://www.semantikoz.com/wp-content/uploads/2008/03/redhal-128.jpg" class="alignright" alt="Red HAL" />Also known as semantic memory it was developed by Kevin Lund and Curt Burgess from the University of California, Riverside, California. You can download the corresponding paper, <a href="http://www.psychonomic.org/search/view.cgi?id=1105" title="Producing high-dimensional semantic spaces from lexical co-occurrence" target="_blank">Producing high-dimensional semantic spaces from lexical co-occurrence</a>, in PDF format.</p>
<p>The basic premise the work relies on is that words with similar meaning repeatedly occur closely (also known as co-occurrence). As an example in a large corpus of text one could expect to see the words <em>mountain</em>, <em>valley </em>and <em>river </em>appear often close to each other. The same might be true for <em>mouse</em>, <em>cat </em>and <em>dog</em>. <span id="more-4"></span></p>
<p>One could now create a square matrix of a text where all unique words <em>n</em> are represented as a row and column. Now we can read the text and every time we read a new word we look its row vector up in the matrix. Then we take <em>x</em> words on the right and on the left and increment the corresponding column in the row vector for each word. This is a simple sliding window parsing. We can also account for the closeness of a word by incrementing by a larger number for a word closer to the centre word, e.g. a word next to the centre could result in an increment of 5 in its column and a word 4 words away could result in an increment of 2.</p>
<p align="center">&nbsp;</p>
<p align="center"><img src="http://www.semantikoz.com/wp-content/uploads/2008/02/hal-matrix.png" alt="HAL Matrix Example" /><br />
Naive <a href="http://www.semantikoz.com/2008/02/25/hyperspace-analogue-to-language-hal-introduction/">HAL</a> Matrix</p>
<p>As a result words co-occurring have similar rows. If we look at the simplified example in the above matrix we can see that <em>mountain</em>, <em>valley </em>and <em>river </em>have similar rows and so do <em>mouse</em>, <em>cat </em>and <em>dog</em>. These rows can be interpreted as vectors with <em>n</em> dimensions. The &#8220;distance&#8221; between vectors then becomes a proxy for the similarity of meanings of the words represented by the vectors. The &#8220;distance&#8221; often is measured as the cosine of the angle between two vectors. As a result identical vectors, pointing in the same direction, have an angle of 0 degrees and a cosine value of 1. Unrelated vectors would be orthogonal with an angle of 90 degrees and a cosine value of 0. To ease the cosine calculation matrices are often normalised along the rows to the unit length of 1 of the row vectors.</p>
<p>Following the example it also shows that even words not directly co-occurring can share meaning. <em>Dog </em>for example does not appear close to <em>mouse </em>but through its shared meaning with <em>cat </em>also shares meaning with <em>mouse</em>. As a result one can easily group words by their meaning even if they share it only indirectly.</p>
<p>While similar experiments had been done before Lund and Burgess published their work it still was a great breakthrough. Their approach is completely automated and, in contrast to earlier work, does not rely on humans selecting dimensions and training semantic vector spaces. Only the information in a corpus is used to create the matrix and the resulting vector space and thus has no external bias through influence by human actors.</p>
<img src="http://feeds.semantikoz.com/~r/Semantikoz/~4/246037745" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.semantikoz.com/2008/02/25/hyperspace-analogue-to-language-hal-introduction/feed/</wfw:commentRss>
		</item>
		<item>
		<title>Welcome</title>
		<link>http://www.semantikoz.com/2008/02/24/welcome/</link>
		<comments>http://www.semantikoz.com/2008/02/24/welcome/#comments</comments>
		<pubDate>Sun, 24 Feb 2008 23:12:11 +0000</pubDate>
		<dc:creator>CWP</dc:creator>
		
		<category><![CDATA[semantikoz]]></category>

		<category><![CDATA[misc]]></category>

		<guid isPermaLink="false">http://www.semantikoz.com/?p=3</guid>
		<description><![CDATA[Thanks for dropping by and reading this blog. Are you wondering what it is all about and why you should read it and not one of the other millions of blogs? Well that makes two of us. I do not plan to waste your and my time with random ramblings but rather will write about [...]]]></description>
			<content:encoded><![CDATA[<p>Thanks for dropping by and reading this blog. Are you wondering what it is all about and why you should read it and not one of the other millions of blogs? Well that makes two of us. I do not plan to waste your and my time with random ramblings but rather will write about what I work on and what interests me. That might be anything from semantic research to travelling. If you are still reading then you should be interested enough to check out one of my posts before you head off again.</p>

<span class="slashdigglicious">
<a href="http://slashdot.org/bookmark.pl?url=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F&amp;title=Welcome" title="Slashdot It!"><img src="http://slashdot.org/favicon.ico" height="16" width="16" alt="[Slashdot]" /></a>
<a href="http://digg.com/submit?phase=2&amp;url=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F&amp;title=Welcome" title="Digg This Story"><img src="http://digg.com/favicon.ico" width="16" height="16" alt="[Digg]" /></a>
<a href="http://reddit.com/submit?url=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F&amp;title=Welcome" title="Reddit"><img src="http://reddit.com/favicon.ico" width="16" height="16" alt="[Reddit]" /></a>
<a href="http://del.icio.us/post?url=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F&amp;title=Welcome" title="Save to del.icio.us" onclick="window.open('http://del.icio.us/post?v=4&amp;noui&amp;jump=close&amp;url=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F&amp;title=Welcome', 'delicious', 'toolbar=no,width=700,height=400'); return false;"><img src="http://images.del.icio.us/static/img/delicious.small.gif" width="16" height="16" alt="[del.icio.us]" /></a>
<a href="http://www.facebook.com/share.php?u=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F" title="Share on Facebook"><img src="http://www.facebook.com/favicon.ico" width="16" height="16" alt="[Facebook]" /></a>
<a href="http://technorati.com/faves?add=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F" title="Add to my Technorati Favorites"><img src="http://technorati.com/favicon.ico" width="16" height="16" alt="[Technorati]" /></a>
<a href="http://www.google.com/bookmarks/mark?op=edit&amp;output=popup&amp;bkmk=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F&amp;title=Welcome" title="Save to Google Bookmarks"><img src="http://www.google.com/favicon.ico" width="16" height="16" alt="[Google]" /></a>
<a href="http://www.stumbleupon.com/submit?url=http%3A%2F%2Fwww.semantikoz.com%2F2008%2F02%2F24%2Fwelcome%2F&amp;title=Welcome" title="Stumble it!"><img src="http://www.stumbleupon.com/favicon.ico" width="16" height="16" alt="[StumbleUpon]" /></a>
</span><img src="http://feeds.semantikoz.com/~r/Semantikoz/~4/246037746" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.semantikoz.com/2008/02/24/welcome/feed/</wfw:commentRss>
		</item>
	</channel>
</rss>
