homepage/public/assets/embedded/phdpresentation/pres.html

<!doctype html>

<html lang="en">
<head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=1024" />
    <meta name="apple-mobile-web-app-capable" content="yes" />
    <title>Joachim Nielandt - Public defence</title>

    <meta name="description" content="supported by impress.js." />
    <meta name="author" content="Joachim Nielandt" />

    <link href="pres.css" rel="stylesheet" />
    <link href="css/pres2.css" rel="stylesheet" />
    <link href="pres2.css" rel="stylesheet" />

    <link rel="stylesheet" href="node_modules/font-awesome/css/font-awesome.min.css">

    <link rel="shortcut icon" href="favicon.png" />
    <link rel="apple-touch-icon" href="apple-touch-icon.png" />

    <script src="node_modules/jquery/dist/jquery.min.js"></script>


     <script type="text/javascript">
        /**
         * Write a slide heading into the document at the current parser
         * position (intended to be called from an inline script in a step).
         *
         * Output shape:
         *   <span class="title">TITLE<small>[chevron icon]SUBTITLE</small></span>
         *
         * @param {string} title    Main heading text; omitted when it equals ''.
         * @param {string} subtitle Secondary text; the chevron icon and the text
         *                          are only emitted when it does not equal ''.
         *                          The <small> wrapper is always emitted.
         */
        function writeTitle(title, subtitle) {
            // Assemble the complete heading first, then hand it to the parser once.
            var markup = '<span class="title">';
            if (title != '') {
                markup += String(title);
            }
            markup += '<small>';
            if (subtitle != '') {
                markup += '<i class="fa fa-chevron-right" aria-hidden="true"></i>' + subtitle;
            }
            markup += '</small></span>';
            document.write(markup);
        }
    </script>

     <script type="text/javascript">
        // Populate the #steplinks navigation once the DOM is ready: one link per
        // <div> that declares a menutext attribute, in document order. The link
        // targets "./pres.html#/<step id>" and is labelled with the menutext value.
        $( document ).ready(function() {
                var menu = $("#steplinks");
                $("div[menutext]").each(function() {
                    // Build the anchor through attr()/text() instead of string
                    // concatenation, so an id or label containing quotes or
                    // markup characters cannot break the link or inject HTML.
                    $("<a>")
                        .attr("href", "./pres.html#/" + this.id)
                        .text($(this).attr("menutext"))
                        .appendTo(menu);
                });
            });

    </script>
</head>

<body class="impress-not-supported">

        <!--<a href="./pres.html#/firststep">Start</a>
    <a href="./pres.html#/secondstep">Intro</a>
    <a href="./pres.html#/introducingwebpage1">Webpage</a>
    <a href="./pres.html#/introhtmlstep">HTML</a>-->

    <!--introducingthebrowser
    gotouserstep
    firstillustrated
    introducingxpath
    introducingeditdistances
    problem1step -> genxpath
    problem2step-->


<div id="steplinks">

</div>

<div id="impress">

    <div class = "step" id="firststep" menutext="Start" data-x="-200" data-y="-1400">
        <div class="headertitle">Public PhD defence</div>
        <div class="of">of <span class="name">Joachim Nielandt</span></div>
        <div class="phdtitle">XPath-Based Information Extraction</div>
    </div>

    <div class = "step" id="secondstep" menutext="Situation" data-x="-200" data-y="-800">
        <script type="text/javascript">writeTitle('Introduction', 'Situation');</script>
        <ul>
            <li>Extraction of data from web pages</li>
            <li>Usually not easy to do</li>
            <li>Present the data in orderly fashion</li>
            <li>So it can be used for other purposes</li>
        </ul>
    </div>

    <div class = "step" id="thirdstep" data-x="-200" data-y="0">
        <script type="text/javascript">writeTitle('Introduction', 'Overview');</script>
        <p>Three problems are answered in this thesis</p>
        <ol>
            <li>Can a data extractor be built, using user examples and XPath?</li>
            <li>Is it possible to increase the quality using context?</li>
            <li>Can automation be built into the system?</li>
        </ol>
    </div>

    <div class="step" id="introducingwebpage1" data-x="1000" data-y="0">
        <!-- Step: introduces what a web page is; the screenshot carries alt text for non-visual users. -->
        <script type="text/javascript">writeTitle('Introducing', 'A Web page');</script>
        <!--<span class="title">Introducing...<small>A web page</small></span>-->
        <ul>
            <li>Meant to be viewed by human users</li>
            <li>Contains a lot of structure</li>
            <li>Allows for interaction</li>
        </ul>
        <img id="fullimdbscreenshot" src="img/fullimdbscreenshot.png" alt="Full-page screenshot of an IMDB web page" />
    </div>

    <div class="step introducingwebpage" menutext="Webpage" id="introducingwebpage2" data-x="1600" data-y="100">
        <p>What does it take to get a web page to a user?</p>
        <div id="imgblocks">
            <div class="imgblock">
            <h1 class="imgtitle">Server</h1>
                <h2 class="subtitle">www.google.com</h2>
                <div id="testimgserver" class="pixeldiv"></div>
            </div>
            <div class="arrowblock">
                <p class="text">HTML</p>
                <p class="arrow">→</p>
            </div>
            <div class="imgblock">
                <h1 class="imgtitle">Browser</h1>
                <h2 class="subtitle">Google Chrome</h2>
                <div id="testimgbrowser" class="pixeldiv"></div>
            </div>
             <div class="arrowblock">
                <p class="text">Visual</p>
                <p class="arrow">→</p>
            </div>
            <div class="imgblock">
                <h1 class="imgtitle">User</h1>
                <h2 class="subtitle">Internet detective</h2>
                <div id="testimgpanther" class="pixeldiv"></div>
            </div>
        </div>
    </div>

    <div class="step introducingserver" data-x="2000" data-y="800">
        <div id="introducingtheserverimg" class="pixeldiv"></div>
        <script type="text/javascript">writeTitle('Introducing', 'The server');</script>
        <p>The server typically serves <i>HTML</i> content</p>
        <ul>
            <li>Will be processed by the browser</li>
            <li>Looks like a bunch of code? It <i>is</i> a bunch of code</li>
            <li>Can be <i class="very">very</i> pretty</li>
        </ul>
    </div>

    <div class="step unicorn" id="unicornstep" data-x="2000" data-y="1150" data-scale="0.8">
        <!-- Step: three short paragraphs of text next to the #wufimg picture block. -->
        <div id="wuftext">
            <p>Just like unicorns are</p>
            <p><i>very</i></p>
            <p>pretty horses</p>
        </div>
        <div id="wufimg" class="pixeldiv"></div>
    </div>

    <div class="step" data-x="2000" id="introhtmlstep" data-y="1600">
        <script type="text/javascript">writeTitle('Introducing', 'HTML');</script>
        <p>This piece of the IMDB website...</p>
        <div id="biglebowskiscreenshot" class="pixeldiv"></div>
    </div>

    <div class="step" data-x="2000" data-y="2200">
        <p>Looks like this as HTML code.</p>
        <div id="biglebowskidomscreenshot" class="pixeldiv"></div>
    </div>

     <div id="htmlisbuilt" class="step whiteoverlay" data-x="1800" data-y="2150" data-scale="0.3">
        <h2 class="subtitle"><b>HTML code is built using <i>tags</i></b></h2>
        <table>
            <tr>
                <td><code>&lt;div&gt;</code></td><td> → </td><td>opens the tag</td>
            </tr>
            <tr>
                <td><code>&lt;/div&gt;</code></td><td> → </td><td>closes the tag</td>
            </tr>
            <tr><td colspan="3"><code>&lt;div&gt;</code>anything in between is <i>content</i><code>&lt;/div&gt;</code></td></tr>
            <tr><td colspan="3"><code>&lt;and attributes="have values"&gt;</code></td></tr>
        </table>
    </div>

    <div class="step" id="introducingthebrowser" menutext="Browser" data-x="2000" data-y="2800">
        <!-- Step: repeats the server → browser → user diagram under the "The browser" heading.
             NOTE(review): the ids imgblocks, testimgserver, testimgbrowser and testimgpanther
             are also declared in the introducingwebpage2 step earlier in this file. ids must be
             unique per document (getElementById and fragment links resolve to the first match
             only). Renaming them presumably requires matching stylesheet changes; verify
             against pres.css before touching them. -->
        <script type="text/javascript">writeTitle('Introducing', 'The browser');</script>
        <div id="imgblocks">
            <div class="imgblock">
            <h1 class="imgtitle">Server</h1>
                <h2 class="subtitle">www.google.com</h2>
                <div id="testimgserver" class="pixeldiv"></div>
            </div>
            <div class="arrowblock">
                <p class="text">HTML</p>
                <p class="arrow">→</p>
            </div>
            <div class="imgblock">
                <h1 class="imgtitle">Browser</h1>
                <h2 class="subtitle">Google Chrome</h2>
                <div id="testimgbrowser" class="pixeldiv"></div>
            </div>
             <div class="arrowblock">
                <p class="text">Visual</p>
                <p class="arrow">→</p>
            </div>
            <div class="imgblock">
                <h1 class="imgtitle">User</h1>
                <h2 class="subtitle">Internet detective</h2>
                <div id="testimgpanther" class="pixeldiv"></div>
            </div>
        </div>
    </div>

    <div class="step browserinfo" data-x="2000" data-y="3200">
        <p>The browser downloads the HTML and it...</p>
        <ul>
            <li>builds an internal DOM model,</li>
            <li>allows for interaction, modifications, ...</li>
            <li>produces a visual page for the user.</li>
        </ul>
    </div>

    <div id="detectivebubble1" class="step userintro" menutext="User" data-x="2500" data-y="2900" data-scale="0.7">
        <!-- This element used to carry a second id attribute ("gotouserstep"). HTML parsers keep
             only the first occurrence of a duplicated attribute, so "detectivebubble1" has always
             been the effective id (and the target of the generated "User" menu link). -->
        <div class="bubble eventually">Eventually this is about me, right?</div>
    </div>

    <div id="detectivebubble2" class="step userwishes" data-x="2500" data-y="2900" data-scale="0.7">
        <div class="bubble wishes">
            <ol>
                <li>I love movies</li>
                <li>I want to build my own cinema</li>
                <li>I need to track all the movies I own</li>
                <li>IMDB has the information, but I cannot easily use it</li>
            </ol>
        </div>
    </div>

    <div class="step" id="firstillustrated" menutext="Prob1" data-x="3500" data-y="2900">
        <!-- Step: the screenshot block with three highlight boxes (star1..star3) layered inside it.
             NOTE(review): id "biglebowskiscreenshot" is also used by the introhtmlstep step earlier
             in this file; ids must be unique per document. Renaming presumably needs a matching
             stylesheet change; verify against pres.css. -->
        <script type="text/javascript">writeTitle('Problem 1', 'Illustrated');</script>
        <p>Extract all the stars</p>
        <div id="biglebowskiscreenshot" class="pixeldiv">
            <div id="star1" class="starhighlight"> </div>
            <div id="star2" class="starhighlight"> </div>
            <div id="star3" class="starhighlight"> </div>
        </div>
    </div>

    <div class="step" data-x="3500" data-y="3500">
        <script type="text/javascript">writeTitle('Problem 1', 'Context');</script>
        <div id="firstcontext">
            <div class="textcontainer">
                <p>Impractical to get all that information manually</p>
                <ul>
                    <li>There’s a lot of IMDB content: <i>4.1 million</i> titles</li>
                    <li>What do you do with <i>new</i> movies?</li>
                    <li>What about errors corrected by IMDB?</li>
                </ul>
            </div>
            <div id="shockface" class="pixeldiv"></div>
        </div>
    </div>

    <div class="step" data-x="3500" data-y="4200">
        <script type="text/javascript">writeTitle('Problem 1', 'Introducing');</script>
        <ul>
            <li>Let's make it easier!</li>
            <li>Have a program do the heavy lifting</li>
            <li>Ask the user for minimal input: a couple of examples</li>
        </ul>
        <div id="cpulifting" class="pixeldiv"></div>
    </div>

    <div class="step" id="assumestep" data-x="3500" data-y="5000">
        <p><b>Assume</b>: each movie page shows<br/> its stars in the same spot</p>
        <div id="biglebowskiscreenshotexamples" class="pixeldiv"></div>
    </div>

    <div class="step whiteoverlay" id="twoexamples" data-x="3600" data-y="5110" data-scale="0.5">
        <p>The user gives two examples, <i>Jeff</i> and <i>John</i>...</p>
        <div id="example1" class="starhighlight"> </div>
        <!--<div id="example2" class="starhighlight"> </div>-->
    </div>

    <div class="step whiteoverlay" id="julianneexample" data-x="3800" data-y="5137" data-scale="0.5">
        <p>... and the goal is to automatically find <i>Julianne</i>!</p>
        <small>Ideally, this process can then be performed <br/>on other movies' pages too</small>
        <div id="example3" class="starhighlight"> </div>
    </div>

    <div class="step" id="lookatdom" data-x="3900" data-y="5100" data-z="250" data-rotate-y="-40" data-scale="0.8">
        <!-- Step: DOM screenshot with two highlight boxes marking the user's examples. -->
        <p>Let's look at the underlying DOM structure</p>
        <!--<div id="lookatdomimg" class="pixeldiv"></div>-->
        <img src="img/lebowskistarsdom.png" alt="DOM structure behind the stars section of the page" />
        <div id="jeffexample" class="starhighlight"> </div>
        <div id="johnexample" class="starhighlight"> </div>
    </div>

    <div class="step" id="lookatdomoutro" data-x="3800" data-y="5100" data-z="250" data-rotate-y="-40" data-scale="0.8">
        <ul>
            <li>We need to <i>build</i> a rule: <i>“Get all stars”</i></li>
            <li>Requires a way to extract elements from DOM</li>
        </ul>
    </div>


    <div class="step" id="introducingxpath" menutext="XPath" data-x="3500" data-y="5800">
        <script type="text/javascript">writeTitle('Introducing', 'XPath');</script>
        <ul>
            <li>XPath is a query language</li>
            <li>Can be used to retrieve elements from a structured document</li>
            <ul><li>... such as the DOM model of a web page</li></ul>
            <li>Makes use of the node names in the DOM (<i>span</i>, <i>a</i>, <i>h4</i> ...)</li>
        </ul>
    </div>

    <div class="step" id="jeffbridgeszoomstep" data-x="3500" data-y="6270" data-scale="1.2">
        <div id="jeffbridgeszoom" class="pixeldiv"></div>
        <p class="bigger">An XPath example...</p>
        <p>This XPath selects <i>Jeff Bridges</i> from the HTML example:</p>
        <p class="xpath">/span/a/span/text()</p>
    </div>

    <div class="step" id="introducingxpath2" data-x="3500" data-y="6750">
        <p>XPaths can be very complex, but for now we keep to this structure:</p>
        <p class="xpath">/step1/step2/step3</p>
        <p>A step looks like this:</p>
        <p class="xpath">axis::nodetest[predicate]</p>
        <p>The full version of the example XPath would be:</p>
        <p class="xpath" style="font-size:25px">/child::span[1]/child::a[1]/child::span[1]/child::text()[1]</p>
    </div>

    <div class="step whiteoverlay" id="xpathstepdetail" data-x="3700" data-y="6800" data-z="200">
        <span class="title">In detail...<small>XPath step</small></span>
        <p class="xpath">axis::nodetest[predicate]</p>
        <ul>
            <li>The axis is a first selection:</li>
            <ul>
                <li>All the children (<span class="xpath">child</span>), the parent (<span class="xpath">parent</span>), itself (<span class="xpath">self</span>) ...</li>
            </ul>
            <li>The nodetest is a second selection:</li>
            <ul>
                <li>Only span elements: <span class="xpath">span</span></li>
                <li>All the elements of the axis: <span class="xpath">*</span></li>
                <li>Only text elements: <span class="xpath">text()</span></li>
            </ul>
            <li>The predicate is a last filter</li>
            <ul>
                <li>Can be very complex</li>
                <li>The simplest, a single number: <span class="xpath">div[1]</span></li>
            </ul>
        </ul>
    </div>

    <div class="step" id="firstproblemgettingthere" data-x="5000" data-y="6800">
        <script type="text/javascript">writeTitle('Problem 1', 'Defined at last');</script>
        <div id="sleeping" class="pixeldiv"></div>
        <ul>
            <li>Don’t worry, we’re getting there.</li>
            <li>Remember the examples we have to give to the system?</li>
        </ul>

    </div>

    <div class="step" id="pointingstep" data-x="5000" data-y="7400">
        <!-- Step: DOM image with a pointing-hand icon and two highlight boxes for the examples.
             NOTE(review): ids "jeffexample" and "johnexample" are also declared in the lookatdom
             step earlier in this file; ids must be unique per document. Renaming presumably needs
             a matching stylesheet change; verify against pres.css. -->
        <p class="textright">We can now <i>point</i> to the user's examples with <i class="textlarge">XPaths</i>!</p>
        <div id="pointingfinger"><i class="fa fa-hand-o-down" aria-hidden="true"></i></div>
        <div id="lookatdomimg2" class="pixeldiv"></div>
        <div class="examplecontainer">
            <div id="jeffexample" class="starhighlight"> </div>
            <div id="johnexample" class="starhighlight"> </div>
        </div>
    </div>

    <div class="step" id="xpathspointingstep" data-x="5200" data-y="7600" data-scale="1.5">
        <div class="xpath whiteoverlay one">/div[1]/span[1]/a[1]/span[1]/text()</div>
        <div class="xpath whiteoverlay two">/div[1]/span[2]/a[1]/span[1]/text()</div>
        <div id="leftfinger1"><i class="fa fa-hand-o-left" aria-hidden="true"></i></div>
        <div id="leftfinger2"><i class="fa fa-hand-o-left" aria-hidden="true"></i></div>
    </div>

    <div class="step" id="generalisingxpaths" data-x="5000" data-y="8400">
        <script type="text/javascript">writeTitle('Problem 1', 'Generalising XPaths');</script>
        <p>Considering <i>n</i> XPaths, create a single one that does the same: a <i>generalised XPath</i></p>
        <ul>
            <li>Remove anything that is not similar!</li>
            <ul>
                <li class="xpath">/div[1]/span<span class="red">[1]</span>/a[1]/span[1]/text()[1]</li>
                <li class="xpath">/div[1]/span<span class="red">[2]</span>/a[1]/span[1]/text()[1]</li>
            </ul>
            <li>This leaves us with an XPath that selects all actors</li>
            <ul>
                <li class="xpath">/div[1]/<span class="green">span</span>/a[1]/span[1]/text()[1]</li>
            </ul>
        </ul>
    </div>

    <div class="step" id="okaystep" data-x="5000" data-y="9000">
        <div id="okay">Okay.</div>
        <p>It's not that simple.</p>
        <ul>
            <li>The structure of documents is not always <i>that</i> similar</li>
            <li>We need a nice way of making generalisation <i>always</i> possible</li>
        </ul>
    </div>

    <div class="step" id="introducingeditdistances" menutext="Editdist" data-x="6000" data-y="9000">
        <script type="text/javascript">writeTitle('Introducing', 'Edit distances');</script>
        <p>Finding and exploiting similarities has been done with strings</p>
        <ul>
            <li>A string is a list of characters, e.g., "thedude"</li>
            <li>Edit distance between two strings <i>a</i> and <i>b</i></li>
            <li>= the minimal number of edit operations to transform <i>a</i> into <i>b</i></li>
            <ul>
                <li><span class="green">Add</span> character</li>
                <li><span class="red">Remove</span> character</li>
                <li><span class="blue">Replace</span> character</li>
            </ul>
        </ul>

    </div>

    <div class="step" id="editdistanceexample" data-x="6000" data-y="9460">
        <p>For example, "<b>T<span class="red">h</span>ed<span class="red">u</span>d<span class="blue">e</span></b>" can be transformed into "<b>Tedd<span class="blue">y</span></b>" by:</p>
        <ul>
            <li><span>Removing</span> the <i class="red">h</i></li>
            <li><span>Removing</span> the <i class="red">u</i></li>
            <li><span>Replacing</span> <i class="blue">e</i> by <i class="blue">y</i></li>
        </ul>
        <p>Leading to an edit distance of 3</p>
    </div>

    <div class="step" id="examplealignmentstep" data-x="5900" data-y="9580" data-z="300">
        <p class="firstp">Using the edit distance, an alignment can be calculated!</p>
        <div id="examplealignment">
            <p>Thedude</p>
            <p>T ed dy</p>
        </div>
        <p>This is also possible for multiple strings</p>
    </div>

    <div class="step" id="usexpathssteps" data-x="6740" data-y="9620" data-z="300" data-rotate-y="-40">
        <p>We can use this alignment for our XPaths!</p>
        <p>Instead of <span class="characters"><span class="c">c</span><span class="c">h</span><span class="c">a</span><span class="c">r</span><span class="c">a</span><span class="c">c</span><span class="c">t</span><span class="c">e</span><span class="c">r</span><span class="c">s</span></span> in strings...</p>
        <p>     ...we use the XPath <span class="xpath"><span>/</span>s<span>/</span>t<span>/</span>e<span>/</span>p<span>/</span>s</span> in the XPaths.</p>
    </div>

    <div class="step" id="problem1step" data-x="6300" data-y="10800">
        <span class="title">Problem 1<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Generalise user's examples</small></span>
        <p>Now it's for real, these are some user XPaths!</p>
        <ul>
            <li class="xpath">/html[1]/body[1]/div[2]/div[1]/table[1]/tr[1]/td[1]</li>
            <li class="xpath">/html[1]/body[1]/div[2]/div[2]/table[1]/tr[3]/td[1]</li>
            <li class="xpath">/html[1]/div[2]/div[2]/table[1]/tr[3]/td[1]</li>
        </ul>
        <p>A possible alignment:</p>
        <ul>
            <li class="xpath"><span class="xstep1 xstep">/html[1]</span>/<span class="xstep2 xstep blue">body[1]</span>/<span class="xstep3 xstep">div[2]</span>/<span class="xstep4 xstep green">div[1]</span>/<span class="xstep5 xstep">table[1]</span>/<span class="xstep6 xstep red">tr[1]</span>/<span class="xstep7 xstep">td[1]</span></li>
            <li class="xpath"><span class="xstep1 xstep">/html[1]</span>/<span class="xstep2 xstep blue">body[1]</span>/<span class="xstep3 xstep">div[2]</span>/<span class="xstep4 xstep green">div[2]</span>/<span class="xstep5 xstep">table[1]</span>/<span class="xstep6 xstep red">tr[3]</span>/<span class="xstep7 xstep">td[1]</span></li>
            <li class="xpath"><span class="xstep1 xstep">/html[1]</span>/<span class="xstep2 xstep blue">.&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span>/<span class="xstep3 xstep">div[2]</span>/<span class="xstep4 xstep green">div[2]</span>/<span class="xstep5 xstep">table[1]</span>/<span class="xstep6 xstep red">tr[3]</span>/<span class="xstep7 xstep">td[1]</span></li>
        </ul>
    </div>

    <div class="step whiteoverlay" id="problem1stepplusone" data-x="6300" data-y="10710" data-z="100">
        <p><i>Always the same number of steps for each!</i></p>
        <p>Merge the steps...</p>
        <p>...and create the generalised XPath!</p>
    </div>

    <div class="step" id="resultgeneralisedxpath" data-x="6150" data-y="11200" data-z="300">
        <p>The result: <span class="xpath">/html[1]<span class="blue">//</span>div[2]/<span class="green">div</span>/table[1]/<span class="red">tr</span>/td[1]</span></p>
        <ul>
            <li>A single generalised XPath</li>
            <li>That retrieves all the original examples</li>
            <li>Possibly even more?</li>
        </ul>
    </div>

    <div class="step" data-x="6150" data-y="12000" data-z="300">
        <span class="title">Solution 1<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Generalised XPath</small></span>
        <p>The generalised XPath solves the first problem:</p>
        <p><i>Given some user examples, find <i>all</i> the relevant items.</i></p>
        <p><b>Solution:</b></p>
        <ul>
            <li>Convert all user examples to XPaths</li>
            <li>Align and merge the examples, create generalised XPath</li>
            <li>Execute the generalised XPath on web pages</li>
        </ul>
    </div>

    <div class="step" id="problem2step" menutext="Prob2" data-x="7500" data-y="12000">
        <span class="title">Problem 2<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Improve quality</small></span>
        <p>Generalised XPaths work <span class="positivefont fontlarger">well</span> for:</p>
        <ul>
            <li>Documents with a lot of structure</li>
            <li>Uniquely identifiable structures</li>
            <li>Not much replication</li>
        </ul>
        <p><span class="problemfont fontlarger">Problematic situations</span> exist where structure alone is <span class="emph">not enough</span></p>
    </div>

    <div class="step" data-x="7530" data-y="12580" data-z="400">
        <p class="positivefont fontlarger">For example!</p>
        <p><i class="fa fa-hand-o-right" aria-hidden="true"></i> Imagine two web pages with a similar structure</p>
        <!--<div id="imaginefiller"></div>-->
        <div id="problempage1" class="pixeldiv"></div>
        <!--<div id="problempage2" class="pixeldiv"></div>-->
    </div>

    <div class="step" id="disasterstep" data-x="7230" data-y="12800" data-z="450">
        <!-- Step: text box with the "disaster" message and two row-pointer labels, shown with #problempage2. -->
        <div class="textbox">
            <p class="positivefont fontlarger">But, <span class="problemfont">disaster...</span></p>
            <p><i>someone</i> made a mistake.</p>
            <p id="probablyfrank" class="onlyshowhere"><i>Probably</i> Frank</p>
            <span id="disasterhand1" class="onlyshowhere">3<sup>rd</sup> row <i class="fa fa-hand-o-right fa-lg" aria-hidden="true"></i></span>
            <span id="disasterhand2" class="onlyshowhere"><i class="fa fa-hand-o-left fa-lg" aria-hidden="true"></i> 2<sup>nd</sup> row</span>
        </div>
        <!--<div id="imaginefiller"></div>-->
        <!--<div id="problempage1" class="pixeldiv"></div>-->
        <div id="problempage2" class="pixeldiv"></div>
    </div>

    <div class="step" id="toogeneral" data-x="7230" data-y="13800">
        <!-- Step: two page snippets plus overlay boxes: generalexample1-3 use class "generalexample",
             generalexample4-10 use class "toomuchexample". -->
        <div class="textbox">
            <p>Based on these examples...</p>
        </div>
        <div class="imgbox">
            <img id="generalimg1" src="img/thebiglebowskisnippet.png" alt="The Big Lebowski page snippet" />
            <img id="generalimg2" src="img/drstrangelovesnippet.png" alt="Dr. Strangelove page snippet" />
        </div>
        <div id="generalexample1" class="generalexample"></div>
        <div id="generalexample2" class="generalexample"></div>
        <div id="generalexample3" class="generalexample"></div>

        <div id="generalexample4" class="toomuchexample"></div>
        <div id="generalexample5" class="toomuchexample"></div>
        <div id="generalexample6" class="toomuchexample"></div>
        <div id="generalexample7" class="toomuchexample"></div>
        <div id="generalexample8" class="toomuchexample"></div>
        <div id="generalexample9" class="toomuchexample"></div>
        <div id="generalexample10" class="toomuchexample"></div>
    </div>

    <div class="step" id="toogeneral2" data-x="7350" data-y="14100" data-z="-50">
        <!-- Step: picture plus the "too general" conclusion text. -->
        <img id="thegeneralimg" src="img/generalturgidson.jpg" alt="General Turgidson" />
        <div class="textbox">
            <p>The generalised XPath</p>
            <p>will be too <i>general</i>.</p>
        </div>
        <!--<div class="thegeneral">
        </div>-->
    </div>

    <div class="step" id="solution2step" data-x="7350" data-y="15000">
        <!-- Step: heading for solution 2, two bullets and the page snippet image. -->
        <span class="title">Solution 2<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Exploit context!</small></span>
        <ul>
            <li>Exploit more than just <i>structure</i></li>
            <li>Look around for helpful evidence or context</li>
        </ul>
        <div class="imgcontainer">
                <img src="img/thebiglebowskisnippet.png" alt="The Big Lebowski page snippet" />
        </div>
    </div>

    <div class="step" id="zoomthatmother" data-x="7300" data-y="15127" data-scale="0.4">
        <div style="text-align:center;">
        <span><i class="fa fa-search fa-5x" aria-hidden="true"></i></span>
        <ul class="textblock">
            <li>Exploit text</li>
            <li>Exploit styling</li>
            <li>Exploit common ancestors</li>
        </ul>
        </div>
    </div>

    <div class="step" id="introducingadvancedxpaths" menutext="Predicates" data-x="9000" data-y="15000">
        <span class="title">Introducing<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Advanced predicates</small></span>
        <p>XPath predicates already helped us:</p>
        <ul>
            <li class="xpath">html[1]/body[1]/div[2]</li>
        </ul>
        We can make them more complex:
        <ul>
            <li><span  class="xpath">div[child::a]</span> → select a div that has an a child</li>
            <li><span class="xpath">div[preceding-sibling::table]</span></li>
            <li><span class="xpath">div[preceding-sibling::text()="Stars"]</span></li>
        </ul>
    </div>

    <div class="step" id="predicateenrichment" data-x="9000" data-y="16500">
        <!-- Step: introduces predicate enrichment; illustration followed by two bullets. -->
        <span class="title">Solution 2<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Predicate enrichment!</small></span>
        <p>Add predicates to generalised XPath</p>
        <p>to make it more</p>
        <p><span class="narrow">narrow</span></p>
        <div>
        <div class="imgcontainer">
                <img src="img/filter_data (1).png" alt="Data filter illustration" />
        </div>
        <ul>
            <li>Not allowing undesired results ...</li>
            <li>... increases data extraction precision</li>
        </ul>
        </div>
    </div>

    <div class="step" id="predicateenrichment2" data-x="9000" data-y="16900">
        <ul>
            <li>In this work, 6 types of predicates are proposed</li>
            <li>They are automatically added</li>
            <li>A preparation step is needed</li>
             <ul>
                <li>Build set of <span class="emph">indicated</span> nodes</li>
                <li>Build set of <span class="emph">overflow</span> nodes</li>
            </ul>
        </ul>
    </div>

    <div class="step" data-x="9000" id="boldstep" menutext="tree" data-y="17250">
        <!-- Step: preformatted HTML sample (whitespace inside <pre> is significant) next to a picture. -->
        <span class="title">Predicate enrichment<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Preparation</small></span>
        <div id="boldstatementsnippet">
            <pre>
&lt;body&gt;
    &lt;div&gt;
        &lt;span&gt;
            <span class="bold">&lt;b&gt;</span>Some <span class="bold">bold</span> statement<span class="bold">&lt;/b&gt;</span>
        &lt;/span&gt;
    &lt;/div&gt;
&lt;/body&gt;
            </pre>
            <img src="img/vincent.jpg" alt="Vincent" />
        </div>
    </div>

    <div class="step" id="treestep1" data-x="9100"  data-y="17400" data-z="-200" data-scale="0.9">
        <div class="tree" id="boldtree">

            <ul>
                <li>
                    <a class="rootnode">root</a>
                    <ul>
                        <li>
                            <a>body</a>
                            <ul>
                                <li>
                                    <a>div</a>
                                    <ul>
                                        <li>
                                            <a>span</a>
                                            <ul>
                                                <li>
                                                    <a class="nudgeright circle correspondnode">b</a>
                                                </li>
                                            </ul>
                                        </li>
                                    </ul>
                                </li>
                                <li>
                                    <a>div</a>
                                    <ul>
                                        <li>
                                            <a>div</a>
                                            <ul>
                                                <li>
                                                    <a>span</a>
                                                    <ul>
                                                        <li>
                                                            <a class="nudgeright circle">b</a>
                                                        </li>
                                                    </ul>
                                                </li>
                                            </ul>
                                        </li>
                                    </ul>
                                </li>
                                <li>
                                    <a>div</a>
                                    <ul>
                                        <li>
                                            <a>table</a>
                                            <ul>
                                                <li>
                                                    <a>tr</a>
                                                    <ul>
                                                        <li>
                                                            <a>td</a>
                                                            <ul>
                                                                <li>
                                                                    <a>span</a>
                                                                    <ul>
                                                                        <li>
                                                                            <a class="nudgeright">b</a>
                                                                        </li>
                                                                    </ul>
                                                                </li>
                                                            </ul>
                                                        </li>
                                                        <li>
                                                            <a>td</a>
                                                        </li>
                                                    </ul>
                                                </li>
                                            </ul>
                                        </li>
                                    </ul>
                                </li>
                            </ul>
                        </li>
                    </ul>
                </li>
            </ul>
        </div>
    </div>

    <!-- Step "treestep2": the example document tree, with each node tagged by CSS classes
         (indicatednode / overflownode / desirednode / undesirednode, ...). The visual meaning
         of those classes is defined in the external stylesheets, not in this file. -->
    <div class="step" id="treestep2" data-x="9970"  data-y="17400" data-z="-500" data-scale="0.9">
        <div class="tree" id="boldtree2">

            <ul>
                <li>
                    <a class="rootnode">root</a>
                    <ul>
                        <li>
                            <a class="bodynode indicatednode">body</a>
                            <ul>
                                <!-- Branch 1: div > span > b; the b node carries the "circle" class (a user example, see the textbox below) -->
                                <li>
                                    <a class="divnode indicatednode divgrandparent">div</a>
                                    <ul>
                                        <li>
                                            <a class="spannode indicatednode spanparent">span</a>
                                            <ul>
                                                <li>
                                                    <a class="bnode nudgeright circle correspondnode desirednode">b</a>
                                                </li>
                                            </ul>
                                        </li>
                                    </ul>
                                </li>
                                <!-- Branch 2: div > div > span > b; the b node is also a circled, desired node -->
                                <li>
                                    <a class="divnode indicatednode">div</a>
                                    <ul>
                                        <li>
                                            <a class="divgrandparent">div</a>
                                            <ul>
                                                <li>
                                                    <a class="spannode indicatednode spanparent">span</a>
                                                    <ul>
                                                        <li>
                                                            <a class="bnode nudgeright circle desirednode">b</a>
                                                        </li>
                                                    </ul>
                                                </li>
                                            </ul>
                                        </li>
                                    </ul>
                                </li>
                                <!-- Branch 3: div > table > tr > td > span > b; not circled, the b node is tagged "undesirednode" -->
                                <li>
                                    <a class="divnode overflownode">div</a>
                                    <ul>
                                        <li>
                                            <a>table</a>
                                            <ul>
                                                <li>
                                                    <a>tr</a>
                                                    <ul>
                                                        <li>
                                                            <a class="notmatchingspanrulenode">td</a>
                                                            <ul>
                                                                <li>
                                                                    <a class="spannode overflownode">span</a>
                                                                    <ul>
                                                                        <li>
                                                                            <a class="bnode nudgeright undesirednode">b</a>
                                                                        </li>
                                                                    </ul>
                                                                </li>
                                                            </ul>
                                                        </li>
                                                        <li>
                                                            <a>td</a>
                                                        </li>
                                                    </ul>
                                                </li>
                                            </ul>
                                        </li>
                                    </ul>
                                </li>
                            </ul>
                        </li>
                    </ul>
                </li>
            </ul>
        </div>
        <!-- Legend for the circled nodes in the tree above -->
        <div class="textbox">
            The <span class="userexampletext">circles</span> are user examples.
        </div>
    </div>

     <!-- Tree sub-step 1: shows the generalised XPath built from the examples and notes the one undesired match. -->
     <div class="step treesubstep" id="treesubstep1"
          data-x="9950" data-y="17400" data-rotate-y="-45">
         <div class="textbox">
             <p>Using the examples, build this generalised XPath</p>
             <span class="xpath">/body[1]/div//span[1]/b[1]</span>
             <p>...which results in one <span class="undesirednodetext">undesired</span> node</p>
         </div>
     </div>

     <!-- Tree sub-step 2: the "body[1]" location step is wrapped in .currentxpathstep. -->
     <div class="step treesubstep" id="treesubstep2"
          data-x="9950" data-y="17400" data-rotate-y="-45">
         <div class="textbox">
             <span class="xpath">/<span class="currentxpathstep">body[1]</span>/div//span[1]/b[1]</span>
         </div>
     </div>

     <!-- Tree sub-step 3: the "div" location step is wrapped in .currentxpathstep. -->
     <div class="step treesubstep" id="treesubstep3"
          data-x="9950" data-y="17400" data-rotate-y="-45">
         <div class="textbox">
             <span class="xpath">/body[1]/<span class="currentxpathstep">div</span>//span[1]/b[1]</span>
         </div>
     </div>

    <!-- Tree sub-step 4: the "span[1]" location step is wrapped in .currentxpathstep. -->
    <div class="step treesubstep" id="treesubstep4"
         data-x="9950" data-y="17400" data-rotate-y="-45">
        <div class="textbox">
            <span class="xpath">/body[1]/div//<span class="currentxpathstep">span[1]</span>/b[1]</span>
        </div>
    </div>

     <!-- Tree sub-step 5: the final "b[1]" location step is wrapped in .currentxpathstep. -->
     <div class="step treesubstep" id="treesubstep5"
          data-x="9950" data-y="17400" data-rotate-y="-45">
         <div class="textbox">
             <span class="xpath">/body[1]/div//span[1]/<span class="currentxpathstep">b[1]</span></span>
         </div>
     </div>

    <!-- Tree sub-step 6: introduces the terms "indicated nodes" and "overflow nodes" next to the generalised XPath. -->
    <div class="step treesubstep" id="treesubstep6"
         data-x="9990" data-y="17400" data-z="-100" data-rotate-y="-45">
        <div class="textbox">
            <p>The nodes that we travelled through on the way to an <span class="desired">example</span> are <span class="indicated">indicated nodes</span>, others are <span class="overflow">overflow nodes</span></p>
            <span class="xpath">/body[1]/div//span[1]/b[1]</span>
        </div>
    </div>

    <!-- Step "solution2final": outlines predicate enrichment as the second solution.
         The sub-list is nested inside its parent <li>, because <ul> may only contain <li> children. -->
    <div class="step" id="solution2final" data-x="10000" data-y="18500">
        <span class="title">Solution 2<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Predicate enrichment</small></span>
        <ul>
            <li>The generalised XPath targets too much</li>
            <li>Use predicates to narrow the scope</li>
            <li>Find out a suitable predicate
                <ul>
                    <li>Automatically</li>
                    <li>Based on rules and context</li>
                    <li>Not too restrictive</li>
                    <li>Should still be readable</li>
                </ul>
            </li>
        </ul>
    </div>

    <!-- Tree sub-step 7: worked example of adding a [parent::div] predicate to the span step.
         Fixes relative to the previous markup: the .xpath span in the "Result" item is now
         explicitly closed, and the sub-list is nested inside its parent <li>
         (<ul> may only contain <li> children). -->
    <div class="step treesubstep" id="treesubstep7" data-x="10000" data-y="17400" data-rotate-y="-40">
        <div class="textbox">
            <span class="title"><small>Example</small></span>
            <!--<p>Example!</p>-->
            <ul>
                <li>The <span class="desirednodetext">desired</span> <b>b</b> tags all have a <b class="spanparenttext">span</b> parent and <b class="divgrandparenttext">div</b> grandparent</li>
                <li>This can be added as a predicate to the <b>span</b> tags
                    <ul>
                        <li>The indicated nodes fulfill it</li>
                        <li>... and the overflow nodes <span class="notmatchingspanrule">don't</span></li>
                    </ul>
                </li>
                <li>Result: <span class="xpath">/body[1]/div//span[1]<span class="spanparenttext">[parent::div]</span>/b[1]</span></li>
            </ul>
        </div>
    </div>


    <!-- Step "conclusion2": wrap-up slide for solution 2 (predicate enrichment). -->
    <div class="step" id="conclusion2"
         data-x="12000" data-y="17400">
        <span class="title">Solution 2<small><i class="fa fa-chevron-right" aria-hidden="true"></i> Predicate enrichment</small></span>

        <p>In this work, 6 predicates are proposed</p>
        <ul>
            <li>Each fulfills a specific purpose</li>
            <li>Easily expandable</li>
        </ul>

        <p>Tests indicate increased precision</p>
        <ul>
            <li>Less undesired elements are retrieved</li>
            <li>Positive impact on overall quality</li>
        </ul>
    </div>

    <!-- Step "problem3intro": introduces the third problem (automation).
         The menutext attribute is read by the menu-building script in <head>, so it is kept as is.
         <img> is a void element: the stray </img> end tag was removed, and an empty alt marks
         the illustration as decorative (NOTE(review): supply a description if it is meant to be informative). -->
    <div class="step" id="problem3intro" menutext="Prob3" data-x="14000" data-y="17400">
        <script type="text/javascript">writeTitle('Third problem', 'Automation');</script>
        <p>Can everything up until now be somehow <span class="automatedword">automated</span>?</p>
        <img id="mrhandyimg" src="img/mrhandybig.png" alt="">
    </div>

     <!-- Step "problem3intro2": frames the automation question in terms of set expansion. -->
     <div class="step" id="problem3intro2"
          data-x="14680" data-y="17850" data-scale="0.8" data-rotate-z="-5">
         <p>Consider this question in the context of</p>
         <p><span class="emph fontlarger">set expansion</span></p>
         <ul>
             <li>Begin with a set of elements</li>
             <li>Automatically expand that set with relevant elements</li>
         </ul>
     </div>

     <!-- Step "problem3intro3": shows the initial set of known movie titles as title-card images.
          Each image now has an alt text so the set is conveyed without the pictures.
          NOTE(review): the alt texts were derived from the image file names; confirm the exact titles. -->
     <div class="step" id="problem3intro3" data-x="14240" data-y="18700" data-scale="0.8">
        <script type="text/javascript">writeTitle('For example', '');</script>
        <p>Let's say we already know a set of movie titles</p>
        <div id="movietitleswapper">
            <div class="movietitlewrapper" id="movietitle1"><img class="movietitleimg" src="img/biglebowskititle.jpg" alt="The Big Lebowski"></div>
            <div class="movietitlewrapper" id="movietitle2"><img class="movietitleimg" src="img/fourroomstitle.jpg" alt="Four Rooms"></div>
            <div class="movietitlewrapper" id="movietitle3"><img class="movietitleimg" src="img/strangelovetitle.jpg" alt="Dr. Strangelove"></div>
            <div class="movietitlewrapper" id="movietitle4"><img class="movietitleimg" src="img/pulpfictiontitle.jpg" alt="Pulp Fiction"></div>
            <div class="movietitlewrapper" id="movietitle5"><img class="movietitleimg" src="img/pickofdestinytitle.jpg" alt="The Pick of Destiny"></div>
        </div>
     </div>

     <!-- Step "problem3intro4": poses the set-expansion question next to sample IMDB page screenshots.
          Each screenshot now has an alt text.
          NOTE(review): the alt texts were derived from the image file names ("pod" is assumed to be
          The Pick of Destiny); confirm against the actual screenshots. -->
     <div class="step" id="problem3intro4" data-x="14240" data-y="19400">
         <div class="card">
             <div class="cardtitle">Question</div>
             <div class="cardbody">If we get a list of 1000 IMDB web pages, can we automatically find new movie titles to extend our set?</div>
         </div>
         <div id="imdbpageswapper">
             <img id="imdbpage1" class="boxshadow imdbpageimg" src="img/imdbpages/fourrooms.png" alt="IMDB page of Four Rooms">
             <img id="imdbpage2" class="boxshadow imdbpageimg" src="img/imdbpages/pod.png" alt="IMDB page of The Pick of Destiny">
             <img id="imdbpage3" class="boxshadow imdbpageimg" src="img/imdbpages/thebiglebowski.png" alt="IMDB page of The Big Lebowski">
             <img id="imdbpage4" class="boxshadow imdbpageimg" src="img/imdbpages/snatch.png" alt="IMDB page of Snatch">
             <img id="imdbpage5" class="boxshadow imdbpageimg" src="img/imdbpages/drstrangelove.png" alt="IMDB page of Dr. Strangelove">
         </div>
     </div>

     <!-- Step "problem3strategy": card listing the three-point strategy for finding new titles. -->
     <div class="step" id="problem3strategy"
          data-x="14240" data-y="19600" data-scale="0.8">
         <div class="card onlyshowhere">
             <div class="cardtitle">Strategy</div>
             <div class="cardbody">
                 <ul>
                     <li>Look for the titles we know</li>
                     <li>Figure out rules for them</li>
                     <li>Generalise XPaths of the found titles</li>
                 </ul>
             </div>
         </div>
     </div>

    <!-- Step "problem3matches": first "Matches" card (titles found in obvious places). -->
    <div class="step" id="problem3matches"
         data-x="14240" data-y="19500" data-scale="0.6">
        <div class="card onlyshowhere">
            <div class="cardtitle">Matches</div>
            <div class="cardbody">
                Titles are found in obvious places...
            </div>
        </div>
    </div>

     <!-- Step "problem3matches2": second "Matches" card (titles found in less obvious places). -->
     <div class="step" id="problem3matches2"
          data-x="14240" data-y="20000" data-scale="0.6">
         <div class="card onlyshowhere">
             <div class="cardtitle">Matches</div>
             <div class="cardbody">
                 ...and less obvious places.
             </div>
         </div>
     </div>

    <!-- Step "problem3matches3": card describing what happens with found terms (XPath generation and clustering).
         The sub-list is nested inside its parent <li>, because <ul> may only contain <li> children. -->
    <div class="step" id="problem3matches3" data-x="14240" data-y="20000" data-scale="1">
        <div class="card onlyshowhere">
            <div class="cardtitle">Found terms?</div>
            <div class="cardbody">
                <ul>
                    <li>For each match, generate its XPath</li>
                    <li>If there are multiple on a page
                        <ul>
                            <li>Cluster the XPaths</li>
                            <li>Similar XPaths get grouped together</li>
                        </ul>
                    </li>
                    <li>Each cluster results in generalised XPath</li>
                </ul>
            </div>
        </div>
    </div>

     <!-- Step "solution3": lists the XPaths at which known movie titles were found.
          The menutext attribute is read by the menu-building script in <head>. -->
     <div class="step" id="solution3" menutext="Cluster"
          data-x="16000" data-y="20000" data-z="-300" data-rotate-y="20">
         <script type="text/javascript">writeTitle('Solution 3', 'Set expansion');</script>
         <p>Assume that known movie titles are found multiple times on the known web pages</p>
         <ul>
             <li class="xpath">/body[1]/html[1]/div[2]/div[4]/h1[1]</li>
             <li class="xpath">/body[1]/html[1]/head[1]/title[1]</li>
             <li class="xpath">/body[1]/html[1]/div[2]/div[4]/h1[1]</li>
             <li class="xpath">/body[1]/html[1]/div[2]/div[4]/div[1]/div[1]/a[1]/b[1]</li>
             <li class="xpath">/body[1]/html[1]/div[2]/div[3]/h1[1]</li>
             <li class="xpath">/body[1]/html[1]/div[1]/div[3]/div[1]/div[1]/a[1]/b[1]</li>
         </ul>
     </div>

     <!-- Step "docluster": title-only "Cluster!" card; its card body is currently commented out. -->
     <div class="step" id="docluster"
          data-x="16000" data-y="19900" data-z="0" data-rotate-y="0">
         <div class="card onlyshowhere">
             <div class="cardtitle">Cluster!</div>
             <!--<div class="cardbody">
                Cluster!
            </div>-->
         </div>
     </div>

    <!-- Step "continuecluster": a curly-bracket glyph plus a card explaining what one cluster is. -->
    <div class="step" id="continuecluster"
         data-x="15500" data-y="20100" data-z="-300">
        <div id="curlybracket">{</div>
        <div class="card onlyshowhere">
            <div class="cardtitle">One cluster</div>
            <div class="cardbody">
                <ul>
                    <li>Has similar XPaths</li>
                    <li>They can be generalised!</li>
                </ul>
            </div>
        </div>
    </div>

     <!-- Step "mergeresult": three clustered XPaths and the generalised XPath they merge into;
          the differing location step is wrapped in .xpathhighlight. -->
     <div class="step" id="mergeresult"
          data-x="15500" data-y="21100" data-z="-300">
         <script type="text/javascript">writeTitle('Solution 3', 'Set expansion');</script>

         <p>A cluster is merged...</p>
         <ul>
             <li class="xpath">/body[1]/html[1]/div[2]/<span class="xpathhighlight">div[4]</span>/h1[1]</li>
             <li class="xpath">/body[1]/html[1]/div[2]/<span class="xpathhighlight">div[4]</span>/h1[1]</li>
             <li class="xpath">/body[1]/html[1]/div[2]/<span class="xpathhighlight">div[3]</span>/h1[1]</li>
         </ul>

         <p>...into a generalised XPath</p>
         <ul>
             <li class="xpath">/body[1]/html[1]/div[2]/<span class="xpathhighlight">div</span>/h1[1]</li>
         </ul>
     </div>

     <!-- Step "mergeresult2": where the generalised XPath is executed again.
          Both sub-lists are nested inside their parent <li>, because <ul> may only contain <li> children. -->
     <div class="step" id="mergeresult2" data-x="15500" data-y="21800" data-z="-200">
         <p>Repeat execution of generalised XPath on:</p>
         <ul>
             <li>The known web pages
                 <ul>
                     <li>Could contain lists of movie titles?</li>
                     <li>Maybe multiple pages contain a single title on the same place?</li>
                 </ul>
             </li>
             <li>Web page on which nothing was found yet
                 <ul>
                     <li>Maybe we can find <i>new</i> items?</li>
                     <li>This expands the original set!</li>
                 </ul>
             </li>
         </ul>
     </div>

     <!-- Step "conclusionstep": opening slide of the conclusions.
          The menutext attribute is read by the menu-building script in <head>. -->
     <div class="step" id="conclusionstep" menutext="Conclusions"
          data-x="15500" data-y="23000">
         <script type="text/javascript">writeTitle('Conclusion', '');</script>
         <p><b>Main focus</b>: Data extraction from (semi-)structured documents.</p>
         <p>The following problems were investigated:</p>
     </div>

     <!-- Conclusion card 1: problem 1 (flask icon) paired with its solution (lightbulb icon). -->
     <div class="step conclusionstep" id="conclusionstep1"
          data-x="15500" data-y="23300" data-z="200">
         <div class="card">
             <div class="cardtitle">Problem 1</div>
             <div class="cardbody">
                 <div class="problem">
                     <div class="problemheader">
                         <i class="fa fa-fw fa-flask" aria-hidden="true"></i>
                     </div>
                     <div class="problemtext">
                         Can a data extractor method be constructed using user examples?
                     </div>
                 </div>
                 <div class="solution">
                     <div class="solutionheader">
                         <i class="fa fa-fw fa-lightbulb-o" aria-hidden="true"></i>
                     </div>
                     <div class="solutiontext">Generalised XPaths</div>
                 </div>
             </div>
         </div>
     </div>

     <!-- Conclusion card 2: problem 2 (flask icon) paired with its solution (lightbulb icon). -->
     <div class="step conclusionstep" id="conclusionstep2"
          data-x="15500" data-y="23600" data-z="400">
         <div class="card">
             <div class="cardtitle">Problem 2</div>
             <div class="cardbody">
                 <div class="problem">
                     <div class="problemheader">
                         <i class="fa fa-fw fa-flask" aria-hidden="true"></i>
                     </div>
                     <div class="problemtext">
                         Can context be used to increase precision?
                     </div>
                 </div>
                 <div class="solution">
                     <div class="solutionheader">
                         <i class="fa fa-fw fa-lightbulb-o" aria-hidden="true"></i>
                     </div>
                     <div class="solutiontext">Predicate enrichment of generalised XPaths</div>
                 </div>
             </div>
         </div>
     </div>

     <!-- Conclusion card 3: problem 3 (flask icon) paired with its solution (lightbulb icon). -->
     <div class="step conclusionstep" id="conclusionstep3"
          data-x="15500" data-y="23900" data-z="600">
         <div class="card">
             <div class="cardtitle">Problem 3</div>
             <div class="cardbody">
                 <div class="problem">
                     <div class="problemheader">
                         <i class="fa fa-fw fa-flask" aria-hidden="true"></i>
                     </div>
                     <div class="problemtext">
                         Can set expansion be performed with enriched XPaths?
                     </div>
                 </div>
                 <div class="solution">
                     <div class="solutionheader">
                         <i class="fa fa-fw fa-lightbulb-o" aria-hidden="true"></i>
                     </div>
                     <div class="solutiontext">Automated lookups were investigated using enriched XPaths</div>
                 </div>
             </div>
         </div>
     </div>

    <!-- Step "laststep": closing "thank you" slide.
         The menutext attribute is read by the menu-building script in <head>, so it is kept as is.
         The question-mark image gets an empty alt because the adjacent text already invites questions
         (NOTE(review): treated as decorative; add a description if that is not the intent). -->
    <div class="step" id="laststep" menutext="Thanks" data-x="15500" data-y="23750" data-z="0" data-scale="4" data-rotate="170">
        <p>Thank you for your attention</p>
        <p>Any question is well appreciated!</p>
        <img id="questionmarkimg" src="img/Low-Poly-Shattered-Question-Mark-2400px.png" alt="">
    </div>


</div>

<!-- Load impress.js, then call impress().init(). The order matters: the inline call
     needs the impress() function that the first script is expected to define. -->
<script src="js/impress.js"></script>
<script>impress().init();</script>

</body>
</html>