<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:lang="en">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>A Mixed-principle Rule-based Approach to the Automatic Syllabification of
                    Serbian</title>
                <author>
                    <name>
                        <forename>Aniko</forename>
                        <surname>Kovač</surname>
                        <affiliation>Department of Language Science and Technology</affiliation>
                        <address>
                            <addrLine>Saarland University Campus A2 2</addrLine>
                            <addrLine>66123 Saarbrücken, Germany</addrLine>
                        </address>
                        <email>anikok@coli.uni-saarland.de</email>
                    </name>
                </author>
                <author>
                    <name>
                        <forename>Maja</forename>
                        <surname>Marković</surname>
                        <affiliation>Department of English Language and Literature</affiliation>
                        <address>
                            <addrLine>Faculty of Philosophy, University of Novi Sad Dr Zorana
                                Đinđića 2</addrLine>
                            <addrLine>21000 Novi Sad, Serbia</addrLine>
                        </address>
                        <email>majamarkovic@ff.uns.ac.rs</email>
                    </name>
                </author>
            </titleStmt>
            <editionStmt>
                <edition><date>2019-03-19</date></edition>
            </editionStmt>
            <publicationStmt>
                <publisher>
                    <orgName xml:lang="sl">Inštitut za novejšo zgodovino</orgName>
                    <orgName xml:lang="en">Institute of Contemporary History</orgName>
                    <address>
                        <addrLine>Kongresni trg 1</addrLine>
                        <addrLine>SI-1000 Ljubljana</addrLine>
                    </address>
                </publisher>
                <pubPlace>http://ojs.inz.si/pnz/article/view/332</pubPlace>
                <date>2019</date>
                <availability status="free">
                    <licence>http://creativecommons.org/licenses/by-nc-nd/4.0/</licence>
                </availability>
            </publicationStmt>
            <seriesStmt>
                <title xml:lang="sl">Prispevki za novejšo zgodovino</title>
                <title xml:lang="en">Contributions to Contemporary History</title>
                <biblScope unit="volume">59</biblScope>
                <biblScope unit="issue">1</biblScope>
                <idno type="ISSN">2463-7807</idno>
            </seriesStmt>
            <sourceDesc>
                <p>No source, born digital.</p>
            </sourceDesc>
        </fileDesc>
        <encodingDesc>
            <projectDesc xml:lang="en">
                <p>Contributions to Contemporary History is one of the central Slovenian scientific
                    historiographic journals, dedicated to publishing articles from the field of
                    contemporary history (the 19th and 20th century).</p>
                <p>The journal is published three times per year in Slovenian and in the following
                    foreign languages: English, German, Serbian, Croatian, Bosnian, Italian, Slovak
                    and Czech. The articles are all published with abstracts in English and
                    Slovenian as well as summaries in English.</p>
            </projectDesc>
            <projectDesc xml:lang="sl">
                <p>Prispevki za novejšo zgodovino je ena osrednjih slovenskih znanstvenih
                    zgodovinopisnih revij, ki objavlja teme s področja novejše zgodovine (19. in 20.
                    stoletje).</p>
                <p>Revija izide trikrat letno v slovenskem jeziku in v naslednjih tujih jezikih:
                    angleščina, nemščina, srbščina, hrvaščina, bosanščina, italijanščina, slovaščina
                    in češčina. Članki izhajajo z izvlečki v angleščini in slovenščini ter povzetki
                    v angleščini.</p>
            </projectDesc>
        </encodingDesc>
        <profileDesc>
            <langUsage>
                <language ident="sl"/>
                <language ident="en"/>
            </langUsage>
            <textClass>
                <keywords xml:lang="en">
                    <term>syllable</term>
                    <term>rule-based approach</term>
                    <term>sonority</term>
                    <term>computational linguistics</term>
                    <term>phonology</term>
                </keywords>
                <keywords xml:lang="sl">
                    <term>zlog</term>
                    <term>pristop na podlagi pravil</term>
                    <term>zvočnost</term>
                    <term>računalniško jezikoslovje</term>
                    <term>fonologija</term>
                </keywords>
            </textClass>
        </profileDesc>
        <revisionDesc>
            <listChange>
                <change>
                    <date>2019-06-03</date>
                    <name>Mihael Ojsteršek</name>
                    <desc>Pretvorba iz DOCX v TEI, dodatno kodiranje</desc>
                </change>
            </listChange>
        </revisionDesc>
    </teiHeader>
    <text>
        <front>
            <docAuthor>Aniko Kovač<note place="foot" xml:id="ftn1" n="*"><hi rend="bold">Department
                        of Language Science and Technology, Saarland University Campus A2 2, 66123
                        Saarbrücken, Germany, <ref target="mailto:anikok@coli.uni-saarland.de"
                            >anikok@coli.uni-saarland.de</ref></hi></note></docAuthor>
            <docAuthor>Maja Marković<note place="foot" xml:id="ftn2" n="**"><hi rend="bold"
                        >Department of English Language and Literature, Faculty of Philosophy,
                        University of Novi Sad Dr Zorana Đinđića 2, 21000 Novi Sad, Serbia, <ref
                            target="mailto:majamarkovic@ff.uns.ac.rs"
                            >majamarkovic@ff.uns.ac.rs</ref></hi></note></docAuthor>
            <docImprint>
                <idno type="cobissType">Cobiss type: 1.01</idno>
                <idno type="UDC">UDC: 004.934:821.163.41</idno>
            </docImprint>
            <div type="abstract" xml:lang="sl">
                <head>IZVLEČEK</head>
                <head style="text-transform: uppercase;">Mešani pristop k avtomatskemu zlogovanju v srbščini na podlagi načel in
                    pravil</head>
                <p>
                    <hi rend="italic">V tem prispevku predstavljamo mešani pristop k avtomatskemu
                        zlogovanju v srbščini na podlagi načel in pravil, ki temelji na predpisnih
                        pravilih tradicionalne slovnice v kombinaciji z načelom zaporedja glede na
                        zvočnost (Sonority Sequencing Principle). Proučujemo težave in omejitve obeh
                        uveljavljenih pristopov, ki temeljita na zbirki pravil in zvočnosti;
                        vpeljujemo algoritem, ki uporablja oba načina za doseganje natančnejše
                        členitve besed na zloge, ki bi bila skladnejša z intuicijo rojenih govorcev;
                        in predstavljamo statistične podatke, povezane z razporeditvijo zlogov in
                        njihovo strukturo v srbščini.</hi>
                </p>
                <p>
                    <hi rend="italic">Ključne besede: zlog, pristop na podlagi pravil, zvočnost,
                        računalniško jezikoslovje, fonologija</hi>
                </p>
            </div>
            <div type="abstract">
                <head>ABSTRACT</head>
                <p>
                    <hi rend="italic">In this paper we present a mixed-principle rule-based approach
                        to the automatic syllabification of Serbian, based on prescriptive rules
                        from traditional grammar in combination with the Sonority Sequencing
                        Principle. We explore the problems and limitations of the existing rule set
                        and sonority-based approaches, introduce an algorithm that utilizes both
                        means in an attempt to produce a more accurate segmentation of words into
                        syllables that is better aligned with the intuition of the native speakers,
                        and present the statistical data related to the distribution of syllables
                        and their structure in Serbian.</hi>
                </p>
                <p>
                    <hi rend="italic">Keywords: syllable, rule-based approach, sonority,
                        computational linguistics, phonology</hi>
                </p>
            </div>
        </front>
        <body>
            <div>
                <head>Introduction</head>
                <p>Syllables have been considered — although not unequivocally (cf. <ref
                        target="#Koehler.1966">Koehler 1966</ref>) — to be one of the basic units in
                    phonology constituting the minimal units of pronunciation, and to play a role in
                    prosody, phonotactics, and phonological processing (<ref
                        target="#Ladefoged.2014">Ladefoged and Johnson 2014</ref>). The role of the
                    segmentation of words into syllables and their distributional properties began
                    to see an increase in importance in speech technologies in the 1990s (<ref
                        target="#Iacoponi.2011">Iacoponi and Savy 2011</ref>), most notably in the
                    areas of speech recognition (SR) and text-to-speech synthesis (TTS).</p>
                <p>Syllable segmentation today plays a role in speech technologies on the segmental
                    level — conditioning the length of segmental units such as consonants and vowels
                    — as well as on the prosodic level — governing rhythmical alternations (<ref
                        target="#Bigi.2014">Bigi and Petrone 2014</ref>). Syllable segmentation is
                    also a key component in hyphenation (e.g. <ref target="#Kaplar.2018">Kaplar et
                        al. 2018</ref>), although it should be noted that, at least in Serbian,
                    hyphenation is governed by a partially diverging set of rules from those
                    governing syllabification<note place="foot" xml:id="ftn3" n="1">For example,
                        hyphenation rules ban the segmentation after a syllable consisting of a
                        single vowel at word onset, while this segmentation is allowed and expected
                        according to the rules of syllabification.</note>. Syllable distribution
                    data is also of crucial importance for psycholinguistic experiments, as syllable
                    frequency has been shown to play a role in the processing of words (e.g. <ref
                        target="#Barber.2004">Barber et al. 2004</ref>; <ref target="#Cholin.2006"
                        >Cholin et al. 2006</ref>; <ref target="#Cholin.2009">Cholin and Levelt
                        2009</ref>). Developing an automatic system of syllabification allows for
                    the segmentation of large-scale language corpora needed for the development of
                    automatic systems or the extraction of relevant data related to syllable
                    frequency distributions, which would otherwise require a large number of trained
                    annotators and would be a resource- and cost-heavy undertaking.</p>
                <p>The two generally distinguishable approaches to automatic syllabification are
                    rule-based versus data-driven approaches (<ref target="#Marchand.2009">Marchand
                        et al. 2009</ref>). While data-driven approaches have taken over many
                    aspects of natural language processing, and there are a number of data-driven
                    models of syllable segmentation using artificial neural networks (e.g. <ref
                        target="#Daelemans.1992">Daelemans and van den Bosch 1992</ref>; <ref
                        target="#Hunt.1993">Hunt 1993</ref>; <ref target="#Stoianov.1997">Stoianov
                        et al. 1997</ref>; <ref target="#Landsiedel.2011">Landsiedel et al.
                        2011</ref>), the unavailability of segmented data for Serbian makes
                    rule-based approaches the only viable option for automatic syllabification in
                    Serbian.</p>
                <p>To the best of our knowledge, there is a single publicly available attempt at
                    developing a rule-based syllabifier for Serbian by <ref target="#Kaplar.2018"
                        >Kaplar et al. (2018)</ref>. In this paper we lay out a number of problems
                    and limitations with the ruleset used in their syllabification system and why
                    relying on the existing set of prescriptive rule descriptions from traditional
                    grammar is insufficient to capture and describe a syllabification system that is
                    aligned with the intuition of native speakers of Serbian. A related attempt at
                    automatic syllabification was developed by <ref target="#Meštrović.2015"
                        >Meštrović et al. (2015)</ref> for Croatian, the key difference between
                    their work and ours being in the principle behind the syllabification algorithm
                    which in their case relied solely on the onset maximization principle — limiting
                    possible syllable onsets to valid onsets at the beginning of words. Taking into
                    account <ref target="#Morelli.1999">Morelli’s (1999)</ref> limitations on
                    possible syllable onsets in Serbo-Croatian, the onset maximization principle
                    employed by Meštrović et al. could be considered a comparatively liberal system.
                    In order to attempt to constrain our syllabifier, we decided on a different
                    approach that does not rely on onset maximization, but rather on a combination
                    of a number of alternative principles.</p>
                <p>In this paper we present a mixed-principle rule-based approach to the
                    syllabification of Serbian. Our starting set of rules is based on the <hi
                        rend="italic">Gramatika srpskoga jezika</hi> by <ref
                        target="#Stanojčić.2005">Stanojčić and Popović (2005)</ref>, a prescriptive
                    textbook for Serbian grammar that presents a set of rule descriptions for the
                    segmentation of words into syllables. In a previous version of our
                    syllabification algorithm (<ref target="#Kovač.2018">Kovač and Marković
                        2018</ref>), we made a number of changes to the rule descriptions of <ref
                        target="#Stanojčić.2005">Stanojčić and Popović (2005)</ref> as the
                    formulation of some of the descriptions proved to be redundant, some were
                    example-based and not specific enough for a formal implementation, and we also
                    expanded them with three added modifications related to the treatment of nasals
                    and the alveolar sonorant /<hi rend="italic">r</hi>/ based on <ref
                        target="#Kašić.2014">Kašić (2014)</ref> and the treatment of alveolar
                    sonorants /<hi rend="italic">l</hi>/ and /<hi rend="italic">n</hi>/ based on
                        <ref target="#Zec.2000">Zec (2000)</ref>. In this paper we extend our
                    previous algorithm to include a module for validating the structure of syllables
                    in terms of their compliance with the Sonority Sequencing Principle (SSP), thus
                    further fine-tuning the accuracy of our segmentation, and resolving a number of
                    problems noted in our earlier implementation.</p>
                <p>The goal of the paper is threefold: i) to improve our system for automatic
                    rule-based syllabification for Serbian based on the formalization of existing
                    rule descriptions by the addition of the sonority sequencing validation module,
                    ii) to provide an analysis of the outcomes of the automatic syllabification
                    process in order to address possible theoretical considerations and serve as a
                    basis for the development of future syllabifiers, and iii) to present
                    statistical data related to the distribution of syllables and their structure in
                    Serbian.</p>
            </div>
            <div>
                <head>Prescriptive Rule Descriptions</head>
                <p>Our starting set of rules was based on the formalization of the rule descriptions
                    governing the segmentation of words into syllables from the <hi rend="italic"
                        >Gramatika srpskoga jezika</hi> by <ref target="#Stanojčić.2005">Stanojčić
                        and Popović (2005)</ref>. As this is a prescriptive textbook on Serbian grammar
                    used at the high school level by all student profiles, we expected these rules to
                    constitute the common knowledge base shared by the majority of native
                    speakers.</p>
                <p>Regarding syllable boundaries, <ref target="#Stanojčić.2005">Stanojčić and
                        Popović (2005, 37)</ref> establish the following general rule (1).</p>
                <list type="gloss">
                    <label>(1)</label>
                    <item><hi rend="italic">In words made up of multiple phonemes, consonants,
                            sonorants and vowels, the syllable boundary comes after the vowel and
                            before the consonant (e.g. </hi>či-ta-ti<hi rend="italic"> [</hi>to
                            read<hi rend="italic">]).</hi></item>
                </list>
                <p>In addition to this general rule, they list the following rules — (2), (3), (4),
                    (5) and (6) — that further specify medial syllable boundaries depending on
                    consonant manner of articulation.</p>
                <list type="gloss">
                    <label>(2)</label>
                    <item><hi rend="italic">Medially, in a consonant cluster which has an affricate
                            or fricative sound in its initial position, the syllable boundary will
                            be before that consonant cluster (e.g. </hi>po-šta <hi rend="italic"
                            >[</hi>post<hi rend="italic">], </hi>ma-čka <hi rend="italic"
                            >[</hi>cat<hi rend="italic">]).</hi></item>
                    <label>(3)</label>
                    <item><hi rend="italic">The syllable boundary will be before a consonant cluster
                            if, in a consonant cluster found medially in a word, the second position
                            in the cluster is occupied by one of the sonorants /</hi>v<hi
                            rend="italic">/, /</hi>j<hi rend="italic">/, /</hi>r<hi rend="italic">/,
                            /</hi>l<hi rend="italic">/ or /</hi>ʎ<hi rend="italic">/ preceded by any
                            other consonant besides a sonorant (e.g. </hi>sve-tlost <hi
                            rend="italic">[</hi>light<hi rend="italic">]).</hi></item>
                    <label>(4)</label>
                    <item><hi rend="italic">If a consonant cluster consists of two sonorants, the
                            syllable boundary will be between them so that one sonorant belongs to
                            the preceding, and one sonorant belongs to the following syllable (e.g.
                        </hi>lom-ljen <hi rend="italic">[</hi>broken<hi rend="italic"
                        >]).</hi></item>
                    <label>(5)</label>
                    <item><hi rend="italic">If a consonant cluster consists of a plosive in its
                            initial position and some other consonant except the sonorants
                            /</hi>j<hi rend="italic">/, /</hi>v<hi rend="italic">/, /</hi>l<hi
                            rend="italic">/, /</hi>ʎ<hi rend="italic">/ and /</hi>r<hi rend="italic"
                            >/, the syllable boundary will be between the consonants (e.g.
                        </hi>lep-tir <hi rend="italic">[</hi>butterfly<hi rend="italic"
                        >]).</hi></item>
                    <label>(6)</label>
                    <item><hi rend="italic">If in a cluster of two sonorants, the second position is
                            occupied by the sonorant /</hi>j<hi rend="italic">/ from je in the
                            ijekavica dialect, which corresponds to /</hi>e<hi rend="italic">/ in
                            the ekavica dialect, the syllable boundary will be before that group
                            (e.g. </hi>čo-vjek <hi rend="italic">[</hi>man<hi rend="italic"
                        >]).</hi></item>
                </list>
                <p><ref target="#Stanojčić.2005">Stanojčić and Popović (2005, 32)</ref> also
                    introduce the rule descriptions (7) and (8) to define when the sonorants /<hi
                        rend="italic">r</hi>/, /<hi rend="italic">l</hi>/, and /<hi rend="italic"
                        >n</hi>/ constitute syllable nuclei.</p>
                <list type="gloss">
                    <label>(7)</label>
                    <item><hi rend="italic">The sonorant /</hi>r<hi rend="italic">/ can be a
                            syllable carrier in standard Serbian when:</hi>
                        <list rend="ordered:a">
                            <item><hi rend="italic">it is found medially between two consonants
                                    (e.g. </hi>tr-ča-ti <hi rend="italic">[</hi>to run<hi
                                    rend="italic">]),</hi></item>
                            <item><hi rend="italic">it is found initially before a consonant (e.g.
                                </hi>r-va-ti se <hi rend="italic">[</hi>to wrestle<hi rend="italic"
                                    >]),</hi></item>
                            <item><hi rend="italic">it is found after a vowel in compounds (e.g.
                                    </hi>za-r-đa-ti<hi rend="italic"> [</hi>to rust<hi rend="italic"
                                    >]),</hi></item>
                            <item><hi rend="italic">before /</hi>o<hi rend="italic">/ that is
                                    realized as an /</hi>l<hi rend="italic">/ in other members of
                                    the paradigm (e.g. </hi>o-tr-o <hi rend="italic">(m.) from
                                </hi>o-tr-la <hi rend="italic">(f.) [</hi>wiped<hi rend="italic"
                                    >]).</hi></item>
                        </list>
                    </item>
                    <label>(8)</label>
                    <item><hi rend="italic">The other two alveolar sonorants, /</hi>l<hi
                            rend="italic">/ and /</hi>n<hi rend="italic">/ can be syllable carriers
                            in dialectal toponyms (e.g. </hi>Stlp<hi rend="italic">, </hi>Vlča
                            glava<hi rend="italic">, </hi>Žlne<hi rend="italic">) or foreign
                            toponyms (e.g. </hi>Vltava<hi rend="italic">, </hi>Plzen<hi
                            rend="italic">) but also in other personal names (e.g. English </hi>Idn
                            <hi rend="italic">or Arabic </hi>Ibn-Saud<hi rend="italic">), and in the
                            word </hi>bicikl <hi rend="italic">[</hi>bicycle<hi rend="italic"
                            >].</hi></item>
                </list>
            </div>
            <div>
                <head>Revising the Existing Rule Set</head>
                <p>The development of our syllabification algorithm has been an iterative process
                    testing the existing rule set and making changes as needed. While other authors
                    (e.g. <ref target="#Kaplar.2018">Kaplar et al. 2018</ref>) used the rule
                    descriptions of <ref target="#Stanojčić.2005">Stanojčić and Popović (2005)</ref>
                    directly to implement a software architecture for syllabification in Serbian, we
                    have found a number of problems with this approach.</p>
                <p>The definition of the rule description under (1) causes the initial member of a
                    consonant cluster in the rule descriptions under (2)–(6) to be understood as the
                    first consonant following a vowel. However, given that the sonorants /<hi
                        rend="italic">r</hi>/, /<hi rend="italic">l</hi>/, and /<hi rend="italic"
                        >n</hi>/ can also constitute syllable nuclei in Serbian in certain
                    positions, as presented under rule descriptions (7) and (8), a more precise
                    definition would be that the initial member of a consonant cluster is the first
                    consonant following an element that constitutes a syllable nucleus. The general
                    rule under (1) should then be revised as follows.</p>
                <list type="gloss">
                    <label>(1*)</label>
                    <item><hi rend="italic">In words made up of multiple phonemes, consonants,
                            sonorants and vowels, the syllable boundary comes after the vowel or
                            sonorants /</hi>r<hi rend="italic">/, /</hi>l<hi rend="italic">/, and
                            /</hi>n<hi rend="italic">/ in syllable bearing positions and before the consonant (e.g.
                            </hi>či-ta-ti<hi rend="italic"> [</hi>to read<hi rend="italic">],
                            </hi>tr-ča-ti<hi rend="italic"> [</hi>to run<hi rend="italic"
                        >]).</hi></item>
                </list>
                <p>In addition to our expansion of the general rule presented under (1) to include
                    the syllable bearing sonorants, while formalizing the rule descriptions via
                    finite-state automata, rules (2) and (3) proved to be redundant as they produced
                    identical outcomes to the general rule under (1*). Because of this, these rules
                    were disregarded in our syllabification algorithm.</p>
                <p>During our early testing of the verbatim implementation of the rule descriptions,
                    we also noticed that the existing rule descriptions treated a consonant cluster
                    consisting of a nasal in initial position followed by a consonant that is not
                    one of the sonorants /<hi rend="italic">j</hi>/, /<hi rend="italic">v</hi>/,
                        /<hi rend="italic">l</hi>/, /<hi rend="italic">ʎ</hi>/, and /<hi
                        rend="italic">r</hi>/ as a part of the following syllable onset, producing
                    outcomes such as: <hi rend="italic">gu-ngula</hi> [<hi rend="italic"
                        >commotion</hi>], <hi rend="italic">mo-mci</hi> [<hi rend="italic"
                    >guys</hi>], <hi rend="italic">ka-ncelarije</hi> [<hi rend="italic"
                    >offices</hi>], <hi rend="italic">su-nce</hi> [<hi rend="italic">sun</hi>], etc.
                    Contrary to <ref target="#Stanojčić.2005">Stanojčić and Popović (2005)</ref>,
                    authors such as <ref target="#Kašić.2014">Kašić (2014)</ref> argue that nasals
                    should be treated analogously to plosives during syllabification because there
                    is a complete occlusion in the oral cavity during their production. If this
                    principle were to be employed, rule (5) should be revised as follows.</p>
                <list type="gloss">
                    <label>(5*)</label>
                    <item><hi rend="italic"> If a consonant cluster consists of a plosive or nasal
                            in its initial position and some other consonant except the
                            sonorants</hi> /<hi rend="italic">j</hi>/, /<hi rend="italic">v</hi>/,
                            /<hi rend="italic">l</hi>/, /<hi rend="italic">ʎ</hi>/, and /<hi
                            rend="italic">r</hi>/,<hi rend="italic"> the syllable boundary will be
                            between the consonants.</hi></item>
                </list>
                <p>Following rule (5*), the examples above would then be segmented as: <hi
                        rend="italic">gun-gula</hi> [<hi rend="italic">commotion</hi>], <hi
                        rend="italic">mom-ci</hi> [<hi rend="italic">guys</hi>], <hi rend="italic"
                        >kan-celarije</hi> [<hi rend="italic">offices</hi>], <hi rend="italic"
                        >sun-ce</hi> [<hi rend="italic">sun</hi>], etc. Even though in the earlier
                    implementation of our syllabifier (<ref target="#Kovač.2018">Kovač and Marković
                        2018</ref>) we did not want to employ the Sonority Sequencing Principle
                    (SSP), we opted for the treatment of nasals by <ref target="#Kašić.2014">Kašić
                        (2014)</ref> in our implementation, which respected the limitations put
                    forward by the Sonority Hierarchy, and was more in line with native speaker
                    intuition.</p>
            </div>
            <div>
                <head>The Sonority Hierarchy</head>
                <p>Sonority Theory accounts for the organization of segments into well-formed
                    sequences, both within the syllable and across syllabic boundaries. This
                    organization is driven by principles of sonority, a property that is used as the
                    basis of ranking all sounds along a scale, from less sonorous to more sonorous
                    ones. Although there is a general consensus that segments are ranked by their
                    inherent sonority, the notion of sonority itself is not unambiguously described
                    in the phonetic and phonological literature. Among the phonetic approaches, <ref
                        target="#Ladefoged.1982">Ladefoged (1982)</ref> defines sonority as the
                    perceptual salience or loudness of a sound, and Bloch and Trager (1942;
                    according to <ref target="#Goldsmith.1995">Goldsmith 1995</ref>) define it as
                    the amount of airflow in the resonance chamber. For others, sonority is
                    dependent on multiple phonetic parameters (<ref target="#Ohala.1984">Ohala and
                        Kawasaki 1984</ref>; <ref target="#Ohala.1990">Ohala 1990</ref>; <ref
                        target="#Butt.1992">Butt 1992</ref>). In the phonological literature,
                    sonority is generally defined as a multi-valued feature (<ref
                        target="#Foley.1972">Foley 1972</ref>; <ref target="#Hankamer.1974">Hankamer
                        and Aissen 1974</ref>; <ref target="#Selkirk.1984">Selkirk 1984</ref>),
                    although there are also authors who argue that it is derivable from the more
                    basic binary features of phonological theory (<ref target="#Clements.1990"
                        >Clements 1990</ref>). Other questions that are often addressed are whether
                    sonority scales are universal or language-specific, allowing freedom to
                    languages in assigning sonority values, and how fine-grained distinctions
                    sonority scales should capture. For example, Clements’ universal sonority scale
                    includes only four major classes of consonants (<ref target="#Clements.1990"
                        >Clements 1990</ref>), ranked from least sonorous to most sonorous, as in
                    (i):<lb/></p>
                <list type="gloss">
                    <label>(i)</label>
                    <item>O &lt; N &lt; L &lt; G <lb/> (O=obstruents, N=nasals, L=liquids,
                        G=glides)</item>
                </list>
                <p><ref target="#Selkirk.1984">Selkirk (1984, 112)</ref> proposes a much more
                    detailed scale, which divides all sounds into 11 groups, assuming more subtle
                    differences in sonority values. Selkirk also states that the sonority indices
                    may not be as important in themselves as the sonority relations that they
                    express. Selkirk’s scale of sonority in consonants is given in (ii):</p>
                <list type="gloss">
                    <label>(ii)</label>
                    <item>p, t, k &lt; b, d, g &lt; f, θ &lt; v, z, ð &lt; s &lt; m, n &lt; l &lt;
                        r</item>
                </list>
                <p>Sonority scales serve as the basis of constructing segment sequences within
                    syllables. The universal cross-linguistic generalization is that in the sequence
                    of segments, the one ranking highest on the sonority scale constitutes the peak
                    of the syllable, i.e. it is the syllabic nucleus. As for the other segments
                    around the nucleus, they are organized so that the more sonorous ones are closer
                    to the nucleus, and less sonorous ones are more distant. This generalization is
                    referred to as the Sonority Sequencing Principle (SSP). Thus a syllable with an
                    ascending sonority slope in the onset and a descending slope in the coda, such
                    as, for example <hi rend="italic">blunt</hi>, is a well-formed syllable, whereas
                        *<hi rend="italic">lbutn</hi> is prohibited, due to the violation of the
                    SSP. Adopting the SSP often solves the problems of syllabic consonants, since
                    they generally occur in environments where they constitute a sonority peak, as
                    in the Serbian word <hi rend="italic">pr-vi</hi>.<lb/></p>
            </div>
            <div>
                <head>The Need for Sonority</head>
                <p>Apart from the segmentation of nasals analogously to plosives following <ref
                        target="#Kašić.2014">Kašić (2014)</ref> that relied on principles of the
                    SSP, in our initial attempt at the formalization of the rule description under
                    (8) of <ref target="#Stanojčić.2005">Stanojčić and Popović (2005)</ref> we had
                    to rely on sonority to define the criteria for when the alveolar sonorants /<hi
                        rend="italic">l</hi>/ and /<hi rend="italic">n</hi>/ act as syllable nuclei. </p>
                <p>As Stanojčić and Popović gave no formal criteria defining the contexts of
                    syllable bearing /<hi rend="italic">l</hi>/ and /<hi rend="italic">n</hi>/, our
                    initial attempt was to draw on generalizations based on their examples for syllable
                    carrying /<hi rend="italic">l</hi>/ (<hi rend="italic">Stlp, Vlča glava, Žlne,
                        Vlava, Plzen</hi>) and /<hi rend="italic">n</hi>/ (<hi rend="italic">Idn,
                        Ibn-Saud</hi>). In analogy to the rule descriptions under (7a) and (7b) and
                    our added rule (7c*) defining the contexts in which the alveolar phoneme /<hi
                        rend="italic">r</hi>/ can act as a syllable nucleus, we implemented rule
                    (8*) to define the conditions under which the phonemes /<hi rend="italic"
                    >l</hi>/ and /<hi rend="italic">n</hi>/ can act as syllable bearing nuclei.</p>
                <list type="gloss">
                    <label>(8*)</label>
                    <item><hi rend="italic"> The other two alveolar sonorants, /</hi>l<hi
                            rend="italic">/ and /</hi>n<hi rend="italic">/, can be syllable carriers
                            if they are found:</hi>
                        <list rend="ordered:a">
                            <item><hi rend="italic"> medially between two consonants,</hi></item>
                            <item><hi rend="italic">initially before a consonant, or</hi></item>
                            <item><hi rend="italic">finally after a consonant.</hi></item>
                        </list>
                    </item>
                </list>
                <p>However, the formulation under (8*) allowed for outcomes such as: <hi
                        rend="italic">Be-rn</hi>, <hi rend="italic">Ka-rl</hi>, <hi rend="italic"
                        >erla-jn</hi>, <hi rend="italic">Kla-jn</hi>, <hi rend="italic"
                        >kasa-rn-skim</hi>, <hi rend="italic">Linko-ln</hi>, <hi rend="italic"
                        >Va-jl-om</hi>, etc. in which the phonemes /<hi rend="italic">l</hi>/ and
                        /<hi rend="italic">n</hi>/ identified as syllable nuclei have a lower
                    sonority level than the consonants in their onset or coda. Because the phonemes
                        /<hi rend="italic">r</hi>/ and /<hi rend="italic">j</hi>/ are more sonorous
                    than the phonemes /<hi rend="italic">l</hi>/ and /<hi rend="italic">n</hi>/, and
                    the lateral phoneme /<hi rend="italic">l</hi>/ is more sonorous than the nasal
                    phoneme /<hi rend="italic">n</hi>/, native speakers do not perceive the elements
                    of lower sonority as syllable nuclei in these contexts. <ref target="#Zec.2000"
                        >Zec (2000)</ref> states that alveolar sonorants can be syllable bearing
                    elements in Serbian only in contexts in which there is no segment of a higher
                    level of sonority in their immediate vicinity. Because of this, we needed to
                    further specify rule (8*) to take sonority constraints into consideration as
                    follows.</p>
                <list type="gloss">
                    <label>(8**)</label>
                    <item><hi rend="italic"> The other two alveolar sonorants, /</hi>l<hi
                            rend="italic">/ and /</hi>n<hi rend="italic">/, can be syllable carriers
                            if they are found:</hi>
                        <list rend="ordered:a">
                            <item><hi rend="italic">medially between two consonants of lower
                                    sonority,</hi></item>
                            <item><hi rend="italic">initially before a consonant of lower sonority,
                                    or</hi></item>
                            <item><hi rend="italic">finally after a consonant of lower
                                    sonority.</hi></item>
                        </list>
                    </item>
                </list>
                <p>It turns out that this principle can also account for the behavior of the
                    syllable bearing /<hi rend="italic">r</hi>/ in Serbian. In fact, it does not
                    only provide a general account for consonantal syllabic nuclei in Serbian that
                    subsumes the rules under (7) and (8**); it also accounts for our extension of
                    rule (7) that keeps the consonant cluster /<hi rend="italic">rje</hi>/ of
                    the ijekavica dialect unsegmented in initial position<note place="foot"
                        xml:id="ftn4" n="2"> It should be noted that while sonority sequencing
                        accounts for the non-syllabic treatment of /<hi rend="italic">r</hi>/ before
                            /<hi rend="italic">je</hi>/ in initial position, our rule extension is
                        still needed as it has a more general scope than the sonority rule and
                        accounts for segmentation in medial positions as well (e.g. in words such as
                            <hi rend="italic">isko-rje-nilo</hi> [<hi rend="italic"
                        >eradicated</hi>]).</note>. Because the phoneme /<hi rend="italic">j</hi>/
                    has a higher level of sonority than /<hi rend="italic">r</hi>/, the phoneme /<hi
                        rend="italic">r</hi>/ should not be treated as a syllable nucleus initially
                    in words such as <hi rend="italic">rjeka </hi>[<hi rend="italic"
                    >river</hi>].</p>
                <p>In our previous implementation of the syllabifier (<ref target="#Kovač.2018"
                        >Kovač and Marković 2018</ref>), we attempted to limit our reliance on the
                    Sonority Sequencing Principle to the cases above. However, during the evaluation
                    of our algorithm, we encountered a number of syllable structures that were
                    unexpected due to their absence from the onset maximization approach to
                    syllabification developed for Croatian by <ref target="#Meštrović.2015"
                        >Meštrović et al. (2015)</ref>. Namely, we encountered the syllable
                    structure CCCCCVC in <hi rend="italic">mo-na-</hi><hi rend="italic underline"
                        >rhstvom</hi> [<hi rend="italic">with the monarchy</hi>], the structure
                    CCCCV in the words <hi rend="italic">se-</hi><hi rend="italic underline"
                        >rbska</hi> [<hi rend="italic">Serbian</hi>], <hi rend="italic">ca-</hi><hi
                        rend="italic underline">rstva</hi> [<hi rend="italic">kingdoms</hi>], and
                        <hi rend="italic">sta-ra-te-</hi><hi rend="italic underline">ljstva</hi>
                        [<hi rend="italic">custody</hi>], and the structure CCCCVC in <hi
                        rend="italic">se-</hi><hi rend="italic underline">rbskom</hi> [<hi
                        rend="italic">Serbian</hi>], <hi rend="italic">de-</hi><hi
                        rend="italic underline">jstvom</hi> [<hi rend="italic">with effect</hi>],<hi
                        rend="italic"> vo-</hi><hi rend="italic underline">đstvom</hi> [<hi
                        rend="italic">leadership</hi>], <hi rend="italic">spo-</hi><hi
                        rend="italic underline">rtskim</hi> [<hi rend="italic">sport</hi>], and <hi
                        rend="italic">a-</hi><hi rend="italic underline">lpskog</hi> [<hi
                        rend="italic">alpine</hi>].</p>
                <p>The way we attempted to remedy this issue was to limit the syllable onset length
                    to three-consonant clusters, which is the maximum length of non-syllabic consonant
                    clusters word initially in Serbian (<ref target="#Kašić.2014">Kašić 2014</ref>).
                    While this constraint, in combination with rules (5) and (6), resolved the
                    issues in the examples we encountered — with this limitation, they are segmented
                    as <hi rend="italic">mo-narh-stvom </hi>[<hi rend="italic">with the
                        monarchy</hi>], <hi rend="italic">serb-ska </hi>[<hi rend="italic"
                        >Serbian</hi>] (three-consonant onset limitation + rule (5)), <hi
                        rend="italic">car-stva </hi>[<hi rend="italic">kingdoms</hi>], <hi
                        rend="italic">sta-ra-telj-stva</hi> [<hi rend="italic">custody</hi>], <hi
                        rend="italic">serb-skom</hi> [<hi rend="italic">Serbian</hi>],<hi
                        rend="italic"> dej-stvom</hi> [<hi rend="italic">with effect</hi>], <hi
                        rend="italic">vođ-stvom </hi>[<hi rend="italic">leadership</hi>], <hi
                        rend="italic">sport-skim </hi>[<hi rend="italic">sport</hi>], <hi
                        rend="italic">alp-skog</hi> [<hi rend="italic">alpine</hi>] — some medial
                    clusters with a syllabic consonant still remained a problem. For example, in the
                    word <hi rend="italic">najstrpljiviji </hi>[<hi rend="italic">most
                    patient</hi>], which contains a syllabic /<hi rend="italic">r</hi>/, the
                    syllable boundary would be placed between /<hi rend="italic">na</hi>/ and
                        /<hi rend="italic">jstr</hi>/ — <hi rend="italic">na-jstr-pljiviji</hi> —
                    which does not coincide with native speaker intuition. The Sonority Sequencing
                    Principle seems like a perfect solution for these cases, as it would require the
                    structure of a syllable to follow a sonority scale, with the syllable nucleus
                    being the most sonorous element, while sonority would gradually decrease towards
                    the periphery of the syllable (<ref target="#Zec.2000">Zec 2000</ref>). With
                    this added sonority requirement, the phoneme /<hi rend="italic">j</hi>/, being
                    more sonorous than /<hi rend="italic">s</hi>/ and /<hi rend="italic">t</hi>/,
                    would have to constitute a part of the previous syllable where it would be of a
                    lower sonority when compared to its neighbouring syllable bearing vowel, and the
                    syllable boundary would be <hi rend="italic">naj-str-pljiviji </hi>which is in
                    line with native speaker intuition.</p>
                <p>As a final check following rules (1)–(8**), we add rule (9) that has the ability
                    to shift the syllable boundary in order to avoid a violation of the sonority
                    hierarchy.</p>
                <list type="gloss">
                    <label>(9)</label>
                    <item><hi rend="italic"> If the syllable structure resulting from rules
                            (1)–(8**) does not conform to the Sonority Sequencing Principle, move
                            the boundary so that the phoneme violating the sonority sequence is
                            shifted into the neighboring syllable.</hi></item>
                </list>
            </div>
            <div>
                <head>An Adapted Sonority Hierarchy</head>
                <p>In our sonority sequencing module, we relied on a combination of <ref
                        target="#Selkirk.1984">Selkirk’s (1984)</ref> sonority scale, the sonority
                    apertures for Serbian described by <ref target="#Subotić.2012">Subotić et al.
                        (2012)</ref>, and some notes on sonority sequencing in Serbian from <ref
                        target="#Zec.2000">Zec (2000)</ref>. Our sonority scale is shown under
                    (iii).</p>
                <list type="gloss">
                    <label>(iii)</label>
                    <item>p, t, k &lt; b, d, g &lt; ts, tʃ, tɕ &lt; f, ʃ, h &lt; v, z, ʒ &lt; s &lt;
                        m, n, ɲ &lt; l, ʎ &lt; j, r &lt; a, e, i, o, u</item>
                </list>
                <p>The highest sonority group in our implementation was made up by the vowels of
                    Serbian. As vowels constitute syllable nuclei and there can only be a single
                    vowel per syllable, we did not need to make a distinction between three sonority
                    apertures of vowels (i, u &lt; e, o &lt; a) as is the case in the hierarchy
                    of <ref target="#Subotić.2012">Subotić et al. (2012)</ref>. Following <ref
                        target="#Selkirk.1984">Selkirk (1984)</ref>, we divided sonorants into three
                    sonority classes, and following <ref target="#Zec.2000">Zec (2000)</ref>, we
                    treated liquids as more sonorous than nasals, and, within liquids, the phoneme
                        /<hi rend="italic">r</hi>/ as more sonorous than laterals. For the needs of
                    our implementation, we treated the phoneme /<hi rend="italic">r</hi>/ and glide
                        /<hi rend="italic">j</hi>/ as a single sonority group, although from a
                    theoretical standpoint /<hi rend="italic">j</hi>/ would be considered as more
                    sonorous out of the two given its semi-vowel nature. We opted for treating /s/
                    as an element of higher sonority than voiced fricatives despite its voiceless
                    nature following <ref target="#Selkirk.1984">Selkirk (1984)</ref>, and expanded
                    Selkirk’s hierarchy with the addition of affricates between voiceless fricatives
                    and voiced plosives as a parallel to the aperture order presented by <ref
                        target="#Subotić.2012">Subotić et al. (2012)</ref>.</p>
                <p>It is important to note that there are sequences which clearly do not conform
                    with the SSP in a number of languages, and which may undermine the relevance and
                    power of the sonority hierarchy. A very common pattern, found across a number of
                    unrelated languages, is the possibility of an /<hi rend="italic">s</hi>/ +
                    plosive sequence in the syllable onset, which would be in clear violation if we
                    were to adopt the sonority scale outlined above. In Serbian, there is a known
                    ambiguity in syllable segmentation in the case of continuant fricative phonemes.
                    For example, the word <hi rend="italic">postaviti </hi>[<hi rend="italic">to
                        set</hi>] can be syllabified as both <hi rend="italic">po-sta-vi-ti</hi> and
                        <hi rend="italic">pos-ta-vi-ti</hi> (<ref target="#Gvozdanović.2011"
                        >Gvozdanović 2011</ref>). We therefore adopt the view put forward in <ref
                        target="#Morelli.1999">Morelli (1999)</ref>, who argues that fricatives and
                    plosives may be treated as a single class with respect to sonority in these
                    cases — since splitting them into separate classes would make wrong typological
                    predictions — and add an exception to our sonority sequencing module that leaves
                    fricative + plosive sequences as a viable sequence in the syllable onset.</p>
            </div>
            <div>
                <head>Our Algorithm<note place="foot" xml:id="ftn5" n="3"> Our implementation of the
                        algorithm can be found at <ref
                            target="https://github.com/versi-regular/rule-based_syllabifier_sr"
                            >https://github.com/versi-regular/rule-based_syllabifier_sr</ref>,
                        licensed under the GNU General Public License v3.0. It was developed using
                        Python 3.x and processes 10380 tokens/s on average estimated on a 4,681,713
                        token corpus processed on an Intel® Core™ i5-3210M CPU @ 2.50GHz with 8.00
                        GBs of DDR3L-1600 SODIMM, including pre-processing, clean-up, and
                        transliteration.</note></head>
                <p>Our mixed-principle syllabification algorithm consists of the following
                    steps:</p>
                <list rend="ordered:I">
                    <item n="I">Identify vowels in the word and mark their positions as positions
                        capable of constituting syllable nuclei (based on (1)).</item>
                    <item n="II">If a word contains the letters<hi rend="italic"> l</hi>, <hi
                            rend="italic">n</hi> or the letter<hi rend="italic"> r</hi> not followed
                        by the sequence <hi rend="italic">je</hi> in the center of a consonant
                        cluster consisting of elements of lower sonority or at the beginning of a
                        word followed by a consonant of lower sonority, or the letters <hi
                            rend="italic">l</hi> or <hi rend="italic">n</hi> at the end of a word
                        preceded by a consonant of lower sonority, treat those positions in the word
                        as capable of constituting syllable nuclei (based on (1*), (7), and
                        (8**)).</item>
                    <item n="III">For each position identified as capable of constituting a syllable
                            nucleus:<list rend="ordered:A">
                            <item n="A.">If it is followed by a sequence of two sonorants, mark the
                                syllable boundary between the two sonorants (based on (4)), except
                                if the second sonorant is <hi rend="italic">j</hi> and it is
                                followed by <hi rend="italic">e</hi>. If the second sonorant is <hi
                                    rend="italic">j</hi> followed by <hi rend="italic">e</hi>, mark
                                the syllable boundary before the sonorant cluster (based on
                                (6)).</item>
                            <item n="B.">If it is followed by a sequence of a plosive or nasal and a
                                plosive, fricative, affricate or nasal, mark the syllable boundary
                                between the two consonants (based on (5*)).</item>
                            <item n="C.">In all other cases mark the syllable boundary after the
                                syllable nucleus (based on (1*)).</item>
                        </list></item>
                    <item n="IV">Run a recursive sonority check (based on (9)):<list
                            rend="ordered:A">
                            <item>If the word consists of more than one syllable, convert the
                                syllable structures identified by the previous steps into sonority
                                group values.</item>
                            <item>For each syllable, check if there is a violation of the SSP at the
                                edges of the syllable ignoring the check at the onset of the
                                word-initial syllable and the check in the coda of the word-final
                                syllable.</item>
                            <item>If a violation found is a sequence of a fricative followed by a
                                plosive in the onset, ignore the violation.</item>
                            <item>If there is a violation, remove the letter from the edge of the
                                syllable, and add it onto the neighboring syllable.</item>
                            <item>Repeat until no violation is found.</item>
                        </list></item>
                </list>
            </div>
            <div>
                <head>Syllable Distribution Data</head>
                <p>In this section, we present the statistical distribution data of syllables in
                    Serbian based on our updated syllabification process applied to the Serbian
                    Lemmatized and PoS Annotated Corpus <hi rend="italic">SrpLemKor</hi> (<ref
                        target="#Popović.2010">Popović 2010</ref>; <ref target="#Utvić.2011">Utvić
                        2011</ref>). We chose <hi rend="italic">SrpLemKor </hi>for our analysis,
                    because its annotation allowed us to filter out numbers, Roman numerals,
                    abbreviations and non-Serbian words or suffixes in compounds (at least to some
                    extent) and thus reduce noise in the data.</p>
                <p>The following results show the syllable distribution statistics based on
                    3,648,543 non-unique word-forms (word tokens) from <hi rend="italic"
                        >SrpLemKor</hi>. From a total of 4,681,713 entities (punctuation and word
                    tokens) in our version of the corpus, 113,679 (2.43%) entities of texts #260,
                    #4505 and #4517 were excluded because the files contained faulty encoding. Based
                    on corpus tags, we excluded 919,161 (19.63%) entities tagged PUNCT
                    (punctuation), SENT (sentence separator full-stops), RN (Roman numerals), NUM
                    @card@ (Arabic numerals), ABB (abbreviations) and ? (non-Serbian words and other
                    uncategorized entries). An additional 815 (0.02%) entities that contained the
                    characters w, q and x were removed in an attempt to further reduce noise
                    stemming from foreign words, as not all foreign words were tagged as such in the
                    corpus. In the process of syllabification, an additional 12,877 (0.28%) entities
                    were removed as they were solely made up of consonant clusters with no available
                    syllable nucleus candidate.</p>
            </div>
            <div>
                <head>Syllable Type Distributions in Serbian</head>
                <p>In the 3,648,543 word-forms from <hi rend="italic">SrpLemKor</hi>, a total of
                    8,196,771 syllables were identified. Table 1 presents the syllable type
                    distribution based on our mixed-principle syllabification algorithm.</p>
                <table rend="table-scroll">
                    <head>Table 1: Syllable structure distribution of syllables in the <hi
                        rend="italic">SrpLemKor </hi>corpus</head>
                    <row role="label">
                        <cell>Syllable structure</cell>
                        <cell style="text-align:center;">No. of instances</cell>
                        <cell style="text-align:center;">Percent</cell>
                    </row>
                    <row>
                        <cell>CV</cell>
                        <cell style="text-align:right;">5030622</cell>
                        <cell style="text-align:right;">61,37322</cell>
                    </row>
                    <row>
                        <cell>CCV</cell>
                        <cell style="text-align:right;">938275</cell>
                        <cell style="text-align:right;">11,44689</cell>
                    </row>
                    <row>
                        <cell>CVC</cell>
                        <cell style="text-align:right;">913603</cell>
                        <cell style="text-align:right;">11,14589</cell>
                    </row>
                    <row>
                        <cell>V</cell>
                        <cell style="text-align:right;">852854</cell>
                        <cell style="text-align:right;">10,40476</cell>
                    </row>
                    <row>
                        <cell>CCVC</cell>
                        <cell style="text-align:right;">218126</cell>
                        <cell style="text-align:right;">2,661121</cell>
                    </row>
                    <row>
                        <cell>VC</cell>
                        <cell style="text-align:right;">141980</cell>
                        <cell style="text-align:right;">1,732145</cell>
                    </row>
                    <row>
                        <cell>CCCV</cell>
                        <cell style="text-align:right;">56168</cell>
                        <cell style="text-align:right;">0,685245</cell>
                    </row>
                    <row>
                        <cell>CVCC</cell>
                        <cell style="text-align:right;">20339</cell>
                        <cell style="text-align:right;">0,248134</cell>
                    </row>
                    <row>
                        <cell>CCCVC</cell>
                        <cell style="text-align:right;">14362</cell>
                        <cell style="text-align:right;">0,175215</cell>
                    </row>
                    <row>
                        <cell>CCVCC</cell>
                        <cell style="text-align:right;">6274</cell>
                        <cell style="text-align:right;">0,076542</cell>
                    </row>
                    <row>
                        <cell>VCC</cell>
                        <cell style="text-align:right;">2234</cell>
                        <cell style="text-align:right;">0,027255</cell>
                    </row>
                    <row>
                        <cell><hi rend="color(AEAAAA)">CCCCV</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">780</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,009516</hi></cell>
                    </row>
                    <row>
                        <cell>CVCCC</cell>
                        <cell style="text-align:right;">731</cell>
                        <cell style="text-align:right;">0,008918</cell>
                    </row>
                    <row>
                        <cell>CCCVCC</cell>
                        <cell style="text-align:right;">170</cell>
                        <cell style="text-align:right;">0,002074</cell>
                    </row>
                    <row>
                        <cell><hi rend="color(AEAAAA)">CCCCVC</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">84</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,001025</hi></cell>
                    </row>
                    <row>
                        <cell>VCCC</cell>
                        <cell style="text-align:right;">67</cell>
                        <cell style="text-align:right;">0,000817</cell>
                    </row>
                    <row>
                        <cell>CCCCVC</cell>
                        <cell style="text-align:right;">36</cell>
                        <cell style="text-align:right;">0,000439</cell>
                    </row>
                    <row>
                        <cell>Other</cell>
                        <cell style="text-align:right;">66</cell>
                        <cell style="text-align:right;">0,000805</cell>
                    </row>
                    <row>
                        <cell>Total</cell>
                        <cell style="text-align:right;">8196771</cell>
                        <cell style="text-align:right;">100</cell>
                    </row>
                </table>
                <p>These results show the distribution of syllables in somewhat noisy data. We
                    found there are still foreign words annotated as non-foreign in the corpus
                    constituting some of the less-frequent syllable structures listed as “Other” in
                    Table 1. For example, an instance of the syllable structure VCCCCC was found to
                    correspond to the segmentation of the German word <hi rend="italic">Pe-</hi><hi
                        rend="italic underline">itscht</hi> [<hi rend="italic">lashes</hi>], the
                    syllable structure CCCCVCCC was identified in the German word <hi rend="italic"
                        >Fle-i-</hi><hi rend="italic underline">schmarkt</hi> [<hi rend="italic"
                        >meat market</hi>], and the structure CCCCCVC was found in the German word
                        <hi rend="italic">Gle-i-</hi><hi rend="italic underline">chschal</hi><hi
                        rend="italic">-tung</hi> [<hi rend="italic">co-ordination</hi>]. The
                    structure CCCCCCVC was found in the German word <hi rend="italic">Na-</hi><hi
                        rend="italic underline">chtschat</hi><hi rend="italic">-ten</hi> [<hi
                        rend="italic">nightshade</hi>] and in the toponym <hi rend="italic"
                        >CRYSLER</hi>. The syllable structure CCVCCCC was found in the source
                    transcription of the last name <hi rend="italic">Pe-</hi><hi
                        rend="italic underline">tritsch</hi> and in the English word <hi
                        rend="italic">knights</hi>. The syllable structure CCCVCCC was identified to
                    be a part of the German words <hi rend="italic">Wol-</hi><hi
                        rend="italic underline">fsmilch</hi> [<hi rend="italic">spurge</hi>] and <hi
                        rend="italic">E-in-ge-</hi><hi rend="italic underline">schickt</hi> [<hi
                        rend="italic">sent in</hi>] and to correspond to the English word <hi
                        rend="italic">string</hi>. The syllable structure CCCCCCV was identified in
                    the German words <hi rend="italic">We-i-hna-</hi><hi rend="italic underline"
                        >chtsbra</hi><hi rend="italic">-e-u-che</hi> [<hi rend="italic">Christmas
                        customs</hi>], <hi rend="italic">Stor-</hi><hi rend="italic underline"
                        >chschna</hi><hi rend="italic">-bel</hi> [Crane’s bill], while the structure
                    CCCCCV was found in the words <hi rend="italic">Re-</hi><hi
                        rend="italic underline">chtsge</hi><hi rend="italic">-schi-chte</hi> [<hi
                        rend="italic">history of law</hi>] and <hi rend="italic">Um-gan-</hi><hi
                        rend="italic underline">gsspra</hi><hi rend="italic">-che</hi> [<hi
                        rend="italic">vernacular</hi>], as well as in the sequences <hi
                        rend="italic">šttske</hi> and <hi rend="italic">su-</hi><hi
                        rend="italic underline">žnjstva</hi>. The syllable structure CCCCVCC was
                    found in the German word <hi rend="italic">Ze-it-</hi><hi
                        rend="italic underline">schrift</hi> [<hi rend="italic">magazine</hi>], and
                    in multiple occurrences of the source spelling of the last names <hi
                        rend="italic">Schmidt</hi> and <hi rend="italic">Rot-</hi><hi
                        rend="italic underline">hchild</hi>. The structure VCCCC was found in the
                    German words <hi rend="italic">Deutsch</hi> [<hi rend="italic">German</hi>], <hi
                        rend="italic">Ernst </hi>[<hi rend="italic">seriousness</hi>], in the
                    sequence <hi rend="italic">der-demnaechst</hi> [<hi rend="italic">soon</hi>],
                    and in the strings <hi rend="italic">ikvbv </hi>and <hi rend="italic"
                    >EHCmc</hi>. As can be seen from the examples above, besides foreign origin
                    words, noise in the data can also be found in typos and strings we did not
                    manage to identify. Another example of such a string was <hi rend="italic"
                        >ngBpJKTnQ </hi>identified as the structure VCCCCCCCC. Most structures
                    identified as CVCCCC were the result of typos, e.g. serbsk, kra-levstv,
                    pod-danstv, carstv, slav-jansk, ju-go-slo-venskg, cr-no-gorskg, but also foreign
                    origin names, e.g. <hi rend="italic">Hirsch, Herbst</hi>, <hi rend="italic"
                        >Lokotsch</hi>, and <hi rend="italic">Worlds</hi>, in addition to strings
                    such as <hi rend="italic">majnds</hi> and <hi rend="italic">Gorrrr</hi>. In addition to these, we found one
                    occurrence of the syllable structure CVCCCCCCCC, which stood for the onomatopoeic
                    vulgarism <hi rend="italic">mršššššššš</hi> [<hi rend="italic">go
                    away</hi>].</p>
                <p>We also found 2 syllable structures that differed from the structures found by
                        <ref target="#Meštrović.2015">Meštrović et al. (2015)</ref> for Croatian.
                    The structure CCCCVC was identified in the words <hi rend="italic">vo-</hi><hi
                        rend="italic underline">đstvom</hi> [<hi rend="italic">with
                    leadership</hi>], <hi rend="italic">za-ko-no-da-</hi><hi rend="italic underline"
                        >vstvom</hi> [<hi rend="italic">with legislature</hi>], <hi rend="italic"
                        >mo-nar-hstvom</hi> [<hi rend="italic">with monkhood</hi>], <hi
                        rend="italic">lu-ka-</hi><hi rend="italic underline">vstvom</hi> [<hi
                        rend="italic">with slyness</hi>], <hi rend="italic">be-zzglob-na</hi> [<hi
                        rend="italic">without wrists</hi>], and in the paradigm members of the word
                        <hi rend="italic">po-</hi><hi rend="italic underline">sthlad</hi><hi
                        rend="italic">-no-ra-to-vski</hi> [<hi rend="italic">post-cold-war</hi>]. It
                    also occurred in the Russian word <hi rend="italic">Zdra-</hi><hi
                        rend="italic underline">vstvuj</hi> [hello], in the German-origin word <hi
                        rend="italic">Ha-up-</hi><hi rend="italic underline">tstrum</hi><hi
                        rend="italic">-fi-rer </hi>[<hi rend="italic">mid-level commander</hi>], in
                    the German <hi rend="italic">Ra-u-</hi><hi rend="italic underline"
                        >schmit</hi><hi rend="italic">-tel</hi> [<hi rend="italic">intoxicant</hi>]
                    and <hi rend="italic">Li-e-be-</hi><hi rend="italic underline">spflan</hi><hi
                        rend="italic">-ze</hi> [<hi rend="italic">love plant</hi>] and in the
                    misspelled Serbian words <hi rend="italic">pri-ja-</hi><hi
                        rend="italic underline">tljskih</hi> [<hi rend="italic">friendly</hi>] and
                        <hi rend="italic">kvdrat </hi>[<hi rend="italic">square</hi>]. The structure
                    CCCCV was found in the words <hi rend="italic">bi-</hi><hi
                        rend="italic underline">vstvu</hi> [<hi rend="italic">existence</hi>], <hi
                        rend="italic">va-zdu-ho-plo-</hi><hi rend="italic underline">vstvo</hi> [<hi
                        rend="italic">aviation</hi>], <hi rend="italic">kra-lje-</hi><hi
                        rend="italic underline">vstva</hi> [<hi rend="italic">kingdoms</hi>], <hi
                        rend="italic">zdra-</hi><hi rend="italic underline">vstve</hi><hi
                        rend="italic">-noj</hi> [<hi rend="italic">health</hi>], <hi rend="italic"
                        >vo-</hi><hi rend="italic underline">đstvo</hi> [<hi rend="italic"
                        >leadership</hi>], <hi rend="italic">ču-</hi><hi rend="italic underline"
                        >vstva</hi> [<hi rend="italic">feeling</hi>], <hi rend="italic"
                        >pre-i-mu-</hi><hi rend="italic underline">ćstva</hi> [<hi rend="italic"
                        >advantages</hi>], and <hi rend="italic">mo-gu-</hi><hi
                        rend="italic underline">ćstvu</hi> [<hi rend="italic">possibility</hi>]. It
                    also occurred in German words such as <hi rend="italic">Pfin-</hi><hi
                        rend="italic underline">gstro</hi><hi rend="italic">-se </hi>[<hi
                        rend="italic">peony</hi>], <hi rend="italic">Ke-u-</hi><hi
                        rend="italic underline">schhe</hi><hi rend="italic">-it </hi>[<hi
                        rend="italic">chastity</hi>], <hi rend="italic underline">Schne</hi><hi
                        rend="italic">-e-glo-ec-kchen </hi>[<hi rend="italic">snowdrop</hi>], <hi
                        rend="italic underline">Schne</hi><hi rend="italic">-e-ro-se </hi>[<hi
                        rend="italic">Christmas rose</hi>], <hi rend="italic">Ge-i-</hi><hi
                        rend="italic underline">sskle</hi><hi rend="italic">-e</hi> [<hi
                        rend="italic">cystus</hi>], <hi rend="italic">Vol-</hi><hi
                        rend="italic underline">ksbra</hi><hi rend="italic">-uch</hi> [<hi
                        rend="italic">popular custom</hi>], <hi rend="italic">Vol-</hi><hi
                        rend="italic underline">ksgla</hi><hi rend="italic">-u-ben</hi> [<hi
                        rend="italic">popular belief</hi>], <hi rend="italic underline"
                        >Schri</hi><hi rend="italic">-ften</hi> [<hi rend="italic"
                    >regulations</hi>], <hi rend="italic underline">Schlu</hi><hi rend="italic"
                        >-e-ssel-blu-me</hi> [<hi rend="italic">cowslip</hi>], and more<hi
                        rend="italic">.</hi> We discuss the implications of these for our
                    syllabification algorithm in the Discussion section below.</p>
            </div>
            <div>
                <head>Syllable Type Positional Distributions in Serbian</head>
                <p>We also examined the syllable type frequencies with respect to their position in
                    a word. Four positional frequencies are presented in Table 2: syllable type
                    frequencies in monosyllabic words, and syllable type frequencies in the initial
                    position, in medial positions, and in the final position of polysyllabic
                    words.</p>
                <table rend="table-scroll">
                    <head>Table 2: Syllable structure distribution of syllables in the <hi
                        rend="italic">SrpLemKor </hi>corpus categorized by position</head>
                    <row role="label">
                        <cell style="text-align:center; border:0.5px solid #333333;" rows="3">Syllable structure</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">Monosyllabic
                            words</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="6">Polysyllabic
                            words</cell>
                    </row>
                    <row role="label">
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">MONO</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">INITIAL</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">MEDIAL</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">FINAL</cell>
                    </row>
                    <row role="label">
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                    </row>
                    <row>
                        <cell>CV</cell>
                        <cell style="text-align:right;">612214</cell>
                        <cell style="text-align:right;">50,382</cell>
                        <cell style="text-align:right;">1356771</cell>
                        <cell style="text-align:right;">56,064</cell>
                        <cell style="text-align:right;">1476732</cell>
                        <cell style="text-align:right;">68,956</cell>
                        <cell style="text-align:right;">1584905</cell>
                        <cell style="text-align:right;">65,49</cell>
                    </row>
                    <row>
                        <cell>CCV</cell>
                        <cell style="text-align:right;">62244</cell>
                        <cell style="text-align:right;">5,122</cell>
                        <cell style="text-align:right;">372181</cell>
                        <cell style="text-align:right;">15,379</cell>
                        <cell style="text-align:right;">305247</cell>
                        <cell style="text-align:right;">14,254</cell>
                        <cell style="text-align:right;">198603</cell>
                        <cell style="text-align:right;">8,21</cell>
                    </row>
                    <row>
                        <cell>CVC</cell>
                        <cell style="text-align:right;">129337</cell>
                        <cell style="text-align:right;">10,644</cell>
                        <cell style="text-align:right;">178859</cell>
                        <cell style="text-align:right;">7,391</cell>
                        <cell style="text-align:right;">211979</cell>
                        <cell style="text-align:right;">9,898</cell>
                        <cell style="text-align:right;">393428</cell>
                        <cell style="text-align:right;">16,26</cell>
                    </row>
                    <row>
                        <cell>V</cell>
                        <cell style="text-align:right;">301295</cell>
                        <cell style="text-align:right;">24,795</cell>
                        <cell style="text-align:right;">369133</cell>
                        <cell style="text-align:right;">15,253</cell>
                        <cell style="text-align:right;">61241</cell>
                        <cell style="text-align:right;">2,860</cell>
                        <cell style="text-align:right;">121185</cell>
                        <cell style="text-align:right;">5,01</cell>
                    </row>
                    <row>
                        <cell>CCVC</cell>
                        <cell style="text-align:right;">35428</cell>
                        <cell style="text-align:right;">2,916</cell>
                        <cell style="text-align:right;">50383</cell>
                        <cell style="text-align:right;">2,082</cell>
                        <cell style="text-align:right;">53397</cell>
                        <cell style="text-align:right;">2,493</cell>
                        <cell style="text-align:right;">78918</cell>
                        <cell style="text-align:right;">3,26</cell>
                    </row>
                    <row>
                        <cell>VC</cell>
                        <cell style="text-align:right;">64038</cell>
                        <cell style="text-align:right;">5,270</cell>
                        <cell style="text-align:right;">67539</cell>
                        <cell style="text-align:right;">2,791</cell>
                        <cell style="text-align:right;">7123</cell>
                        <cell style="text-align:right;">0,333</cell>
                        <cell style="text-align:right;">3280</cell>
                        <cell style="text-align:right;">0,14</cell>
                    </row>
                    <row>
                        <cell>CCCV</cell>
                        <cell style="text-align:right;">174</cell>
                        <cell style="text-align:right;">0,014</cell>
                        <cell style="text-align:right;">19754</cell>
                        <cell style="text-align:right;">0,816</cell>
                        <cell style="text-align:right;">20260</cell>
                        <cell style="text-align:right;">0,946</cell>
                        <cell style="text-align:right;">15980</cell>
                        <cell style="text-align:right;">0,66</cell>
                    </row>
                    <row>
                        <cell>CVCC</cell>
                        <cell style="text-align:right;">5368</cell>
                        <cell style="text-align:right;">0,442</cell>
                        <cell style="text-align:right;">1052</cell>
                        <cell style="text-align:right;">0,043</cell>
                        <cell style="text-align:right;">695</cell>
                        <cell style="text-align:right;">0,032</cell>
                        <cell style="text-align:right;">13224</cell>
                        <cell style="text-align:right;">0,55</cell>
                    </row>
                    <row>
                        <cell>CCCVC</cell>
                        <cell style="text-align:right;">1490</cell>
                        <cell style="text-align:right;">0,123</cell>
                        <cell style="text-align:right;">3976</cell>
                        <cell style="text-align:right;">0,164</cell>
                        <cell style="text-align:right;">4427</cell>
                        <cell style="text-align:right;">0,207</cell>
                        <cell style="text-align:right;">4469</cell>
                        <cell style="text-align:right;">0,18</cell>
                    </row>
                    <row>
                        <cell>CCVCC</cell>
                        <cell style="text-align:right;">1635</cell>
                        <cell style="text-align:right;">0,135</cell>
                        <cell style="text-align:right;">206</cell>
                        <cell style="text-align:right;">0,009</cell>
                        <cell style="text-align:right;">17</cell>
                        <cell style="text-align:right;">0,001</cell>
                        <cell style="text-align:right;">4416</cell>
                        <cell style="text-align:right;">0,18</cell>
                    </row>
                    <row>
                        <cell>VCC</cell>
                        <cell style="text-align:right;">1125</cell>
                        <cell style="text-align:right;">0,093</cell>
                        <cell style="text-align:right;">162</cell>
                        <cell style="text-align:right;">0,007</cell>
                        <cell style="text-align:right;">18</cell>
                        <cell style="text-align:right;">0,001</cell>
                        <cell style="text-align:right;">929</cell>
                        <cell style="text-align:right;">0,04</cell>
                    </row>
                    <row>
                        <cell><hi rend="color(AEAAAA)">CCCCV</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">14</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,001</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">21</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,001</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">381</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,018</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">364</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,02</hi></cell>
                    </row>
                    <row>
                        <cell>CVCCC</cell>
                        <cell style="text-align:right;">579</cell>
                        <cell style="text-align:right;">0,048</cell>
                        <cell style="text-align:right;">3</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">1</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">148</cell>
                        <cell style="text-align:right;">0,01</cell>
                    </row>
                    <row>
                        <cell>CCCVCC</cell>
                        <cell style="text-align:right;">105</cell>
                        <cell style="text-align:right;">0,009</cell>
                        <cell style="text-align:right;">0</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">0</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">65</cell>
                        <cell style="text-align:right;">0,00</cell>
                    </row>
                    <row>
                        <cell><hi rend="color(AEAAAA)">CCCCVC</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">1</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,000</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,000</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">25</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,001</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">58</hi></cell>
                        <cell style="text-align:right;"><hi rend="color(AEAAAA)">0,00</hi></cell>
                    </row>
                    <row>
                        <cell>VCCC</cell>
                        <cell style="text-align:right;">45</cell>
                        <cell style="text-align:right;">0,004</cell>
                        <cell style="text-align:right;">0</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">0</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">22</cell>
                        <cell style="text-align:right;">0,00</cell>
                    </row>
                    <row>
                        <cell>CVCCCC</cell>
                        <cell style="text-align:right;">11</cell>
                        <cell style="text-align:right;">0,001</cell>
                        <cell style="text-align:right;">0</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">0</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">25</cell>
                        <cell style="text-align:right;">0,00</cell>
                    </row>
                    <row>
                        <cell>Other</cell>
                        <cell style="text-align:right;">38</cell>
                        <cell style="text-align:right;">0,003</cell>
                        <cell style="text-align:right;">0</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">7</cell>
                        <cell style="text-align:right;">0,000</cell>
                        <cell style="text-align:right;">21</cell>
                        <cell style="text-align:right;">0,00</cell>
                    </row>
                </table>
                <p>Based on <hi rend="italic">SrpLemKor</hi>, the most frequent monosyllabic
                    syllable structures in Serbian are CV (50%), V (25%) and CVC (11%). The most
                    frequent syllable structures in the initial position of polysyllabic words are
                    CV (56%), CCV (15%) and V (15%). In medial positions in polysyllabic words, the
                    most frequent syllable structures are CV (69%), CCV (14%) and CVC (10%). The
                    most frequent syllable structures in the final position of polysyllabic words
                    are CV (65%), CVC (16%) and CCV (8%). It is interesting to note the asymmetry
                    that the syllable structures CCCVCC, VCCC, and CVCCCC occurred only in
                    monosyllabic words and in the final position of polysyllabic words, while the
                    syllable structure CCCCVC occurred in all positions except the initial position
                    in polysyllabic words.</p>
            </div>
            <div>
                <head>Syllable Nuclei Statistics in Serbian</head>
                <p>The distribution of different syllable nuclei in Serbian based on the <hi
                        rend="italic">SrpLemKor</hi> corpus is presented in Table 3.</p>
                <table rend="table-scroll">
                    <head>Table 3: Syllable nuclei statistics and positional frequencies of
                        syllables in the <hi rend="italic">SrpLemKor</hi> corpus</head>
                    <row role="label">
                        <cell style="text-align:center; border:0.5px solid #333333;" rows="3">Nucleus</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2" rows="2">TOTAL</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">Monosyllabic
                            words</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="6">Polysyllabic
                            words</cell>
                    </row>
                    <row role="label">
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">MONO</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">INITIAL</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">MEDIAL</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;" cols="2">FINAL</cell>
                    </row>
                    <row role="label">
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">No. of instances</cell>
                        <cell style="text-align:center; border:0.5px solid #333333;">Percent</cell>
                    </row>
                    <row>
                        <cell>a</cell>
                        <cell style="text-align:right;">2177498</cell>
                        <cell style="text-align:right;">26,566</cell>
                        <cell style="text-align:right;">330629</cell>
                        <cell style="text-align:right;">27,209</cell>
                        <cell style="text-align:right;">604764</cell>
                        <cell style="text-align:right;">24,990</cell>
                        <cell style="text-align:right;">585787</cell>
                        <cell style="text-align:right;">27,353</cell>
                        <cell style="text-align:right;">656318</cell>
                        <cell style="text-align:right;">27,120</cell>
                    </row>
                    <row>
                        <cell>e</cell>
                        <cell style="text-align:right;">1646579</cell>
                        <cell style="text-align:right;">20,088</cell>
                        <cell style="text-align:right;">304442</cell>
                        <cell style="text-align:right;">25,054</cell>
                        <cell style="text-align:right;">447662</cell>
                        <cell style="text-align:right;">18,498</cell>
                        <cell style="text-align:right;">394573</cell>
                        <cell style="text-align:right;">18,425</cell>
                        <cell style="text-align:right;">499902</cell>
                        <cell style="text-align:right;">20,657</cell>
                    </row>
                    <row>
                        <cell>i</cell>
                        <cell style="text-align:right;">1730439</cell>
                        <cell style="text-align:right;">21,111</cell>
                        <cell style="text-align:right;">230637</cell>
                        <cell style="text-align:right;">18,980</cell>
                        <cell style="text-align:right;">394735</cell>
                        <cell style="text-align:right;">16,311</cell>
                        <cell style="text-align:right;">600823</cell>
                        <cell style="text-align:right;">28,056</cell>
                        <cell style="text-align:right;">504244</cell>
                        <cell style="text-align:right;">20,836</cell>
                    </row>
                    <row>
                        <cell>l</cell>
                        <cell style="text-align:right;">939</cell>
                        <cell style="text-align:right;">0,011</cell>
                        <cell style="text-align:right;">326</cell>
                        <cell style="text-align:right;">0,027</cell>
                        <cell style="text-align:right;">32</cell>
                        <cell style="text-align:right;">0,001</cell>
                        <cell style="text-align:right;">77</cell>
                        <cell style="text-align:right;">0,004</cell>
                        <cell style="text-align:right;">504</cell>
                        <cell style="text-align:right;">0,021</cell>
                    </row>
                    <row>
                        <cell>n</cell>
                        <cell style="text-align:right;">1261</cell>
                        <cell style="text-align:right;">0,015</cell>
                        <cell style="text-align:right;">409</cell>
                        <cell style="text-align:right;">0,034</cell>
                        <cell style="text-align:right;">544</cell>
                        <cell style="text-align:right;">0,022</cell>
                        <cell style="text-align:right;">33</cell>
                        <cell style="text-align:right;">0,002</cell>
                        <cell style="text-align:right;">275</cell>
                        <cell style="text-align:right;">0,011</cell>
                    </row>
                    <row>
                        <cell>o</cell>
                        <cell style="text-align:right;">1753091</cell>
                        <cell style="text-align:right;">21,388</cell>
                        <cell style="text-align:right;">168126</cell>
                        <cell style="text-align:right;">13,836</cell>
                        <cell style="text-align:right;">671752</cell>
                        <cell style="text-align:right;">27,758</cell>
                        <cell style="text-align:right;">385687</cell>
                        <cell style="text-align:right;">18,010</cell>
                        <cell style="text-align:right;">527526</cell>
                        <cell style="text-align:right;">21,798</cell>
                    </row>
                    <row>
                        <cell>r</cell>
                        <cell style="text-align:right;">88021</cell>
                        <cell style="text-align:right;">1,074</cell>
                        <cell style="text-align:right;">1898</cell>
                        <cell style="text-align:right;">0,156</cell>
                        <cell style="text-align:right;">66250</cell>
                        <cell style="text-align:right;">2,738</cell>
                        <cell style="text-align:right;">19560</cell>
                        <cell style="text-align:right;">0,913</cell>
                        <cell style="text-align:right;">313</cell>
                        <cell style="text-align:right;">0,013</cell>
                    </row>
                    <row>
                        <cell>u</cell>
                        <cell style="text-align:right;">798943</cell>
                        <cell style="text-align:right;">9,747</cell>
                        <cell style="text-align:right;">178674</cell>
                        <cell style="text-align:right;">14,704</cell>
                        <cell style="text-align:right;">234301</cell>
                        <cell style="text-align:right;">9,682</cell>
                        <cell style="text-align:right;">155010</cell>
                        <cell style="text-align:right;">7,238</cell>
                        <cell style="text-align:right;">230958</cell>
                        <cell style="text-align:right;">9,544</cell>
                    </row>
                </table>
                <p>Based on the positional nucleus distribution data, it can be seen that overall
                    /a/ and /o/ constitute the most frequent nuclei in Serbian. However, there is
                    some positional variation. While the most frequent nuclei in final, medial, and
                    initial position of polysyllabic words are also /a/ and /o/, in monosyllabic
                    words, the most frequent nuclei are /a/ and /e/.</p>
            </div>
            <div>
                <head>Discussion</head>
                <p>While our mixed-principle rule-based syllabification algorithm is suitable for
                    the segmentation of words into syllables following the ruleset we devised based
                    on the combination of prescriptive rule descriptions and the employment of the
                    Sonority Sequencing Principle, there are still some practical and theoretical
                    considerations to be addressed.</p>
                <p>While reporting on the syllable distribution data, we mentioned that the
                    3,648,543 word-forms extracted from <hi rend="italic">SrpLemKor</hi> used for
                    the calculation of statistical data related to the distribution of syllables and
                    their structure in Serbian still contained some noise such as foreign words,
                    typos, and possibly random character strings. Based on 500 random samples taken
                    from the syllable output data checked by a human evaluator, the estimate of the
                    amount of such noise in the data is &lt;2%. Given the nature of corpus-based
                    data, this noise should not significantly impact the reliability of the
                    distributional information.</p>
                <p>From a theoretical standpoint, in formulating our algorithm, we disregarded the
                    three-consonant cluster limitation put forward by <ref
                        target="#Kašić.2014">Kašić (2014)</ref> in favor of exploring the
                    limitations of the sonority module. The occurrence of the two syllable types
                    CCCCVC and CCCCV, which were not present in the onset-maximization-based
                    syllabification algorithm for Croatian (<ref target="#Meštrović.2015">Meštrović
                        et al. 2015</ref>), shows that in a limited number of instances this
                    constraint is needed to exclude syllable clusters that are in accordance with
                    the SSP and prescriptive rule descriptions, but seem contrary to native speaker
                    intuition about syllable boundaries. In addition to this, the ambiguity in
                    syllable segmentation in the case of continuant fricative phonemes (<ref
                        target="#Gvozdanović.2011">Gvozdanović 2011</ref>) – with the continuant
                    constituting either the first place in the onset of the syllable or the last
                    place in the coda of the previous syllable, e.g. the possibility to syllabify
                        <hi rend="italic">postaviti</hi> [<hi rend="italic">to set</hi>] as <hi
                        rend="italic">po-sta-vi-ti</hi> or <hi rend="italic">pos-ta-vi-ti</hi> –
                    would require a larger-scale study examining the intuition of native speakers on
                    syllabification to make an assumption about contemporary tendencies in the
                    segmentation in these contexts.</p>
                <p>In order to verify the syllabic status of different clusters, it would be
                    interesting to conduct a series of monitoring studies modeled after <ref
                        target="#Mehler.1981">Mehler et al. (1981)</ref>, who have shown that
                    reaction times to a word are faster if the word is primed by a sequence
                    corresponding to a syllable in the word when compared to priming with a string
                    that does not constitute a syllable. <ref target="#Bradley.2007">Bradley et al.
                        (1993)</ref> argue that these effects produce mixed results in some
                    languages which contain a large number of ambisyllabic segments, so these
                    studies may also reveal whether and to what extent syllables play a role in
                    pre-lexical processing in Serbian.</p>
            </div>
            <div>
                <head>Conclusion</head>
                <p>In this paper we presented a mixed-principle rule-based syllabifier modelled
                    after the rule descriptions found in <ref target="#Stanojčić.2005"
                        >Stanojčić and Popović (2005)</ref>, extended by rule specifications from <ref
                        target="#Kašić.2014">Kašić (2014)</ref> and <ref target="#Zec.2000">Zec
                        (2000)</ref>, and complemented by a sonority sequencing module based on <ref
                        target="#Selkirk.1984">Selkirk (1984)</ref>, <ref target="#Subotić.2012"
                        >Subotić et al. (2012)</ref>, and <ref target="#Zec.2000">Zec
                    (2000)</ref>.</p>
                <p>An implementation of the existing prescriptive rules for the segmentation of
                    words into syllables allowed us to gain an insight into the problem areas of the
                    rule descriptions, and propose a number of revisions and amendments to the
                    existing rules. The sonority sequencing module revealed the need for an
                    additional onset-length limitation constraint, and pointed out the limitations
                    of sonority in ambiguous consonant clusters that would require further
                    exploration and validation by native speaker intuition. We have also gained an
                    insight into the distribution of different syllable structures and syllable
                    nuclei following this approach, which will be useful for comparison with the
                    performance of alternative syllabification systems.</p>
                <p>In the future, we plan to compare our system to an onset-maximization-based
                    syllabifier for Serbian in combination with the prescriptive rules to see if we
                    can create an alternative system that will produce outputs consistent with the
                    intuition of native speakers of Serbian. It would be interesting to see a
                    systematic comparison of our current approach and the onset-maximization
                    approach with data gathered from the intuition of contemporary native speakers
                    of Serbian.</p>
                <p>We also believe that, while phonological criteria present a basis for
                    syllabification, in the future we will also need to test whether and to what
                    extent approaches based solely on phonological criteria result in syllable
                    boundaries that coincide with morphological boundaries. Our assumption is that
                    phonological rules will need to be amended by morphological criteria to result
                    in syllabification that respects morphological boundaries as well.</p>
                <p>In addition to these, the question of the treatment of foreign origin words and
                    transcribed foreign words might be an additional point to consider. As an
                    extension of a syllabifier, a language detection algorithm might be employed to
                    properly segment the former, while the latter might not need special treatment
                    as the process of transcription should in itself contain a degree of
                    phonological adaptation.</p>
            </div>
            <div>
                <head>Acknowledgment</head>
                <p>This research was supported by the Serbian Ministry of Education and Science
                    under the projects Development of Dialogue Systems for Serbian and Other South
                    Slavic Languages (TR-32035) and Languages and Cultures in Time and Space
                    (ON-178002).</p>
            </div>
        </body>
        <back>
            <div type="bibliography">
                <head>Sources and Literature</head>
                <listBibl>
                    <head>Literature:</head>
                    <bibl xml:id="Barber.2004">Barber, Horacio, Marta Vergara, and Manuel Carreiras.
                        2004. “Syllable-frequency effects in visual word recognition: evidence from
                        ERPs.” <hi rend="italic">Neuroreport</hi> 15 (3): 545–8.</bibl>
                    <bibl xml:id="Bradley.2007">Bradley, Dianne C., Rosa M. Sánchez-Casas, and José
                        E. García-Albea. 1993. “The status of the syllable in the perception of
                        Spanish and English.” <hi rend="italic">Language and Cognitive
                            Processes</hi> 8 (2): 197–233.</bibl>
                    <bibl xml:id="Bigi.2014">Bigi, Brigitte, and Caterina Petrone. 2014. “A generic
                        tool for the automatic syllabification of Italian.” In <hi rend="italic"
                            >Proceedings of The First Italian Conference on Computational
                            Linguistics, CLiC-it</hi>, 73–77. Pisa: Pisa University Press. <ref
                            target="http://siti.fileli.unipi.it/projects/clic/proceedings/Proceedings-CLICit-2014.pdf"
                            >http://siti.fileli.unipi.it/projects/clic/proceedings/Proceedings-CLICit-2014.pdf</ref>.</bibl>
                    <bibl xml:id="Butt.1992">Butt, Matthias. 1992. “Sonority and the Explanation of
                        Syllable Structure.” <hi rend="italic">Linguistische Berichte</hi> 137:
                        45–67.</bibl>
                    <bibl xml:id="Cholin.2006">Cholin, Joana, Willem J. M. Levelt, and Niels O.
                        Schiller. 2006. “Effects of syllable frequency in speech production.” <hi
                            rend="italic">Cognition</hi> 99 (2): 205–35.</bibl>
                    <bibl xml:id="Cholin.2009">Cholin, Joana, and Willem J. M. Levelt. 2009.
                        “Effects of syllable preparation and syllable frequency in speech
                        production: Further evidence for syllabic units at a post-lexical level.”
                            <hi rend="italic">Language and Cognitive Processes</hi> 24 (5):
                        662–84.</bibl>
                    <bibl xml:id="Clements.1990">Clements, George N. 1990. “The Role of the Sonority
                        Cycle in Core Syllabification.” In <hi rend="italic">Papers in Laboratory
                            Phonology I: Between the Grammar and Physics of Speech</hi>, edited by
                        John Kingston and Mary E. Beckman, 282–333. Cambridge: Cambridge
                        University Press.</bibl>
                    <bibl xml:id="Daelemans.1992">Daelemans, Walter, and Antal van den Bosch. 1992.
                        “Generalization Performance of Backpropagation Learning on a Syllabification
                        Task.” In <hi rend="italic">Connectionism and Natural Language Processing:
                            Proceedings of the 3rd Twente Workshop on Language Technology,
                            TWLT3</hi>, 27–38. Enschede: University of Twente, Department of
                        Computer Science. <ref
                            target="https://pure.uvt.nl/portal/files/760578/generalization.pdf"
                            >https://pure.uvt.nl/portal/files/760578/generalization.pdf</ref>.</bibl>
                    <bibl xml:id="Foley.1972">Foley, James. 1972. “Rule Precursors and Phonological
                        Change by Meta-rule.” In <hi rend="italic">Linguistic change and generative
                            theory</hi>, edited by Robert P. Stockwell and Ronald K. S. Macaulay,
                        96–100. Bloomington: Indiana University Press.</bibl>
                    <bibl xml:id="Goldsmith.1995">Goldsmith, John A. 1995. <hi rend="italic">The
                            handbook of phonological theory</hi>. London: Blackwell
                        Publishers.</bibl>
                    <bibl xml:id="Gvozdanović.2011">Gvozdanović, Jadranka. 2011. “Phonological
                        domains.” In <hi rend="italic">Sandhi Phenomena in the Languages of
                            Europe</hi>, edited by Henning Andersen, 27–54. Berlin: Mouton de
                        Gruyter.</bibl>
                    <bibl xml:id="Hankamer.1974">Hankamer, Jorge, and Judith Aissen. 1974. “The
                        sonority hierarchy.” In <hi rend="italic">Papers from the Parasession on
                            Natural Phonology</hi>, edited by Anthony Bruck, Robert Allen Fox, and
                        Michael W. La Galy, 131–45. Chicago: Chicago Linguistic Society.</bibl>
                    <bibl xml:id="Hunt.1993">Hunt, Andrew. 1993. “Recurrent Neural Networks for
                        Syllabification.” <hi rend="italic">Speech Communication</hi> 13 (3–4):
                        323–32.</bibl>
                    <bibl xml:id="Iacoponi.2011">Iacoponi, Luca, and Renata Savy. 2011. “Sylli:
                        Automatic Phonological Syllabification for Italian.” In <hi rend="italic"
                            >INTERSPEECH 2011, 12th Annual Conference of the International Speech
                            Communication Association</hi>, 641–44. Florence: International Speech
                        Communication Association. <ref
                            target="http://eden.rutgers.edu/~li51/php/papers/interspeech2011.pdf"
                            >http://eden.rutgers.edu/~li51/php/papers/interspeech2011.pdf</ref>.</bibl>
                    <bibl xml:id="Kaplar.2018">Kaplar, Sebastijan, Marija Radojičić, Ivan Obradović,
                        Biljana Lazić, and Ranka Stanković. 2018. “Solution for quantitative
                        analysis of texts in Serbian based on syllables.” In <hi rend="italic">ICIST
                            2018 Proceedings</hi> 2, 315–20. Belgrade: Society for Information
                        Systems and Computer Networks. <ref
                            target="http://www.eventiotic.com/eventiotic/library/paper/429"
                            >http://www.eventiotic.com/eventiotic/library/paper/429</ref>.</bibl>
                    <bibl xml:id="Kašić.2014">Kašić, Zorka. 2014. “Opšta lingvistika 2
                        (Fonologija).” Lecture Materials, Faculty of Philosophy, University of
                        Belgrade.</bibl>
                    <bibl xml:id="Koehler.1966">Koehler, Klaus J. 1966. “Is the syllable a
                        phonological universal?” <hi rend="italic">Journal of Linguistics</hi> 2:
                        207–208.</bibl>
                    <bibl xml:id="Kovač.2018">Kovač, Aniko, and Maja Marković. 2018. “A Rule-Based
                        Syllabifier for Serbian.” In <hi rend="italic">Proceedings of the Conference
                            on Language Technologies and Digital Humanities 2018</hi>, 140–46.
                        Ljubljana: Ljubljana University Press.</bibl>
                    <bibl xml:id="Ladefoged.2014">Ladefoged, Peter, and Keith Johnson. 2014. <hi
                            rend="italic">A Course in Phonetics</hi>. Belmont: Wadsworth
                        Publishing.</bibl>
                    <bibl xml:id="Ladefoged.1982">Ladefoged, Peter. 1982. <hi rend="italic">A Course
                            in Phonetics</hi>. New York: Harcourt Brace Jovanovich.</bibl>
                    <bibl xml:id="Landsiedel.2011">Landsiedel, Christian, Jens Edlund, Florian
                        Eyben, Daniel Neiberg, and Björn Schuller. 2011. “Syllabification of
                        conversational speech using Bidirectional Long-Short-Term Memory Neural
                        Networks.” In <hi rend="italic">2011 IEEE International Conference on
                            Acoustics, Speech and Signal Processing (ICASSP)</hi>, 5256–9. Prague:
                        IEEE. <ref target="http://ieeexplore.ieee.org/abstract/document/5947543"
                            >http://ieeexplore.ieee.org/abstract/document/5947543</ref>.</bibl>
                    <bibl xml:id="Marchand.2009">Marchand, Yannick, Connie R. Adsett, and Robert I.
                        Damper. 2009. “Automatic syllabification in English: A comparison of
                        different algorithms.” <hi rend="italic">Language and Speech</hi> 52 (1):
                        1–27.</bibl>
                    <bibl xml:id="Mehler.1981">Mehler, Jacques, Jean Yves Dommergues, Uli
                        Frauenfelder, and Juan Segui. 1981. “The syllable's role in speech
                        segmentation.” <hi rend="italic">Journal of Verbal Learning and Verbal
                            Behavior</hi> 20 (3): 298–305.</bibl>
                    <bibl xml:id="Meštrović.2015">Meštrović, Ana, Sanda Martinčić-Ipšić, and Mihaela
                        Matešić. 2015. “Postupak automatskoga slogovanja temeljem načela najvećega
                        pristupa i statistika slogova za hrvatski jezik.” <hi rend="italic"
                            >Govor</hi>, 32: 3–34.</bibl>
                    <bibl xml:id="Morelli.1999">Morelli, Frida. 1999. “The phonotactics and
                        phonology of obstruent clusters in optimality theory.” PhD diss., University
                        of Maryland.</bibl>
                    <bibl xml:id="Ohala.1984">Ohala, John, and Haruko Kawasaki. 1984. “Prosodic
                        Phonology and Phonetics.” <hi rend="italic">Phonology Yearbook</hi>, 1:
                        113–27.</bibl>
                    <bibl xml:id="Ohala.1990">Ohala, John. 1990. “The Phonetics and Phonology of
                        Aspects of Assimilation.” In <hi rend="italic">Papers in Laboratory
                            Phonology I</hi>, edited by John Kingston and Mary E. Beckman,
                        258–75. Cambridge: Cambridge University Press.</bibl>
                    <bibl xml:id="Popović.2010">Popović, Zoran. 2010. “Taggers Applied on Texts in
                        Serbian.” <hi rend="italic">INFOtheca</hi> 11 (2): 21a–38a.</bibl>
                    <bibl xml:id="Selkirk.1984">Selkirk, Elisabeth O. 1984. “On the Major Class
                        Features and Syllable Theory.” In <hi rend="italic">Language Sound
                            Structure</hi>, edited by Mark Aronoff and Richard T. Oehrle, 107–36.
                        Cambridge: MIT Press.</bibl>
                    <bibl xml:id="Stanojčić.2005">Stanojčić, Živojin, and Ljubomir Popović. 2005.
                            <hi rend="italic">Gramatika srpskoga jezika</hi>. Belgrade: Zavod za
                        udžbenike i nastavna sredstva Beograd.</bibl>
                    <bibl xml:id="Stoianov.1997">Stoianov, Ivelin, John Nerbonne, and Huub Bouma.
                        1997. “Modelling the phonotactic structure of natural language words with
                        Simple Recurrent Networks.” In <hi rend="italic">Computational Linguistics
                            in the Netherlands 1997: Selected Papers from the Eighth CLIN
                            Meeting</hi>, 77–95. Amsterdam: Rodopi.</bibl>
                    <bibl xml:id="Subotić.2012">Subotić, Ljiljana, Dejan Sredojević, and Isidora
                        Bjelaković. 2012. <hi rend="italic">Fonetika i fonologija: Ortoepska i
                            ortografska norma standardnog srpskog jezika</hi>. Novi Sad: Filozofski
                        fakultet Univerziteta u Novom Sadu.</bibl>
                    <bibl xml:id="Utvić.2011">Utvić, Miloš. 2011. “Annotating the Corpus of
                        Contemporary Serbian.” <hi rend="italic">INFOtheca</hi> 12 (2):
                        36a–37a.</bibl>
                    <bibl xml:id="Zec.2000">Zec, Draga. 2000. “O strukturi sloga u srpskom jeziku.”
                            <hi rend="italic">Južnoslovenski filolog</hi> 56 (1-2): 435–48.</bibl>
                </listBibl>
            </div>
            <div type="summary">
                <docAuthor>Aniko Kovač, Maja Marković</docAuthor>
                <head style="text-transform: uppercase;">A mixed-principle rule-based approach to the automatic
                        syllabification of Serbian</head>
                <head rend="subheader" style="text-transform: uppercase;">SUMMARY</head>
                <p>In this paper we present a mixed-principle rule-based approach to the automatic
                    syllabification of Serbian based on prescriptive rule descriptions from
                    traditional grammar found in <ref target="#Stanojčić.2005">Stanojčić and Popović
                        (2005)</ref>, extended by rule specifications from <ref target="#Kašić.2014"
                        >Kašić (2014)</ref> and <ref target="#Zec.2000">Zec (2000)</ref>, and
                    complemented by a sonority sequencing module based on <ref
                        target="#Selkirk.1984">Selkirk (1984)</ref>, <ref target="#Subotić.2012"
                        >Subotić et al. (2012)</ref>, and <ref target="#Zec.2000">Zec
                    (2000)</ref>.</p>
                <p>Syllable segmentation plays a role in speech technologies – most notably in the
                    areas of speech recognition and text-to-speech synthesis – at both the segmental
                    and prosodic levels. It is also one of the governing factors in hyphenation, and
                    syllable frequency distribution data is used in psycholinguistic experiments as
                    a covariate. The unavailability of segmented data for Serbian makes a rule-based
                    approach to automatic syllabification the only viable option as there is no data
                    available for training a data-driven neural network model, and the segmentation
                    of large-scale language corpora by trained annotators would be a resource- and
                    cost-heavy undertaking.</p>
                <p>Our goal in this paper is threefold: i) we extend and improve an earlier version
                    of our syllabification algorithm by introducing a sonority sequencing validation
                    module which resolves a number of issues present in the earlier version of our
                    syllabifier, ii) we provide a detailed analysis of the outcomes of the automatic
                    syllabification process in order to address possible theoretical considerations
                    and serve as a basis for the development of future syllabifiers, and iii) we
                    present the statistical data related to the distribution of syllables and their
                    structure in Serbian to be used in psycholinguistic experiments.</p>
                <p>The implementation of the existing set of prescriptive rules for the
                    segmentation of words into syllables in Serbian allowed us to gain an insight
                    into problem areas of the rule descriptions, and propose a number of revisions
                    and amendments to the existing rules. The sonority sequencing module revealed
                    the need for an additional onset-length limitation constraint, and pointed out
                    the limitations of sonority in ambiguous consonant clusters – such is the case
                    with continuant fricative phonemes that seem to be able to occupy either the
                    first place in the onset of a syllable or the last place in the coda of a
                    previous syllable – that would require further exploration and validation by
                    native speaker intuition. </p>
                <p>The data on the distribution of different syllable structures and syllable
                    nuclei following this approach will be useful for comparison with the
                    performance of alternative syllabification systems. In the future, it would be
                    interesting to see a systematic comparison of our current approach to
                    alternative approaches such as an onset-maximization approach evaluated on
                    segmentation data gathered from the native speakers of Serbian.</p>
            </div>
            <div type="summary" xml:lang="sl">
                <docAuthor>Aniko Kovač, Maja Marković</docAuthor>
                <head style="text-transform: uppercase;">Mešani pristop k avtomatskemu zlogovanju v srbščini na
                        podlagi načel in pravil</head>
                <head rend="subheader" style="text-transform: uppercase;">Povzetek</head>
                <p>V tem prispevku predstavljamo mešani pristop k avtomatskemu zlogovanju v
                    srbščini na podlagi načel in pravil, ki temelji na opisih predpisnih pravil
                    tradicionalne slovnice (kot jih navajata <ref target="#Stanojčić.2005">Stanojčić
                        in Popović 2005</ref>), razširjenih z opredelitvami pravil (kot jih navajata
                        <ref target="#Kašić.2014">Kašić (2014)</ref> in <ref target="#Zec.2000">Zec
                        (2000)</ref>) in dopolnjenih z modulom za zaporedje glede na zvočnost (na
                    podlagi del avtorjev <ref target="#Selkirk.1984">Selkirk 1984</ref>; <ref
                        target="#Subotić.2012">Subotić et al. 2012</ref>; <ref target="#Zec.2000"
                        >Zec 2000</ref>).</p>
                <p>Členitev na zloge ima pomembno vlogo v govornih tehnologijah – zlasti na
                    področjih prepoznavanja govora in pretvorbe besedila v govor – na segmentalni in
                    prozodični ravni. Je tudi eden od vodilnih dejavnikov pri deljenju besed.
                    Podatki o frekvenčni porazdelitvi zlogov se uporabljajo v psiholingvističnih
                    poskusih kot sočasna spremenljivka. Pristop k avtomatskemu zlogovanju, ki
                    temelji na pravilih, je edina smiselna izbira, saj za srbščino ni na voljo
                    segmentiranih podatkov, iz katerih bi se model nevronske mreže lahko učil.
                    Projekt, pri katerem bi usposobljeni komentatorji razčlenjevali obsežne
                    jezikovne korpuse, pa bi bil zelo zahteven in drag.</p>
                <p>Naš prispevek ima tri cilje: i) razširiti in izboljšati predhodno različico
                    našega algoritma za zlogovanje z vpeljavo modula za potrjevanje zaporedja glede
                    na zvočnost, ki odpravlja vrsto težav iz predhodne različice našega
                    zlogovalnika; ii) predstaviti podrobno analizo rezultatov avtomatskega postopka
                    zlogovanja, da bi spodbudili morebitne teoretične razmisleke in zagotovili
                    podlago za razvoj prihodnjih zlogovalnikov; in iii) predstaviti statistične
                    podatke, povezane s porazdelitvijo in strukturo zlogov v srbščini, ki jih bo
                    mogoče uporabiti pri psiholingvističnih poskusih.</p>
                <p>Uporaba uveljavljene zbirke predpisnih pravil za členitev besed na zloge v
                    srbščini nam je omogočila, da smo dobili podroben vpogled v težavna področja pri
                    opisih pravil in predlagali vrsto sprememb in popravkov uveljavljenih pravil.
                    Modul za zaporedje glede na zvočnost je razkril potrebo po dodatni omejitvi
                    dolžine vzglasja in izpostavil omejitve zvočnosti pri dvoumnih soglasniških
                    sklopih (na primer priporniki, ki očitno lahko zavzemajo prvo mesto na začetku
                    zloga ali zadnje mesto na koncu predhodnega zloga), ki bi jih bilo treba dodatno
                    raziskati in potrditi s pomočjo intuicije rojenega govorca.</p>
                <p>Podatke o porazdelitvi različnih zlogovnih struktur in jeder, pridobljene s tem
                    pristopom, bo mogoče uporabiti za primerjavo z delovanjem drugih sistemov za
                    zlogovanje. Zanimivo bi bilo opraviti sistematično primerjavo našega pristopa z
                    drugimi pristopi, na primer pristopom maksimizacije vzglasja, ovrednotenim na
                    podlagi podatkov o členitvi, pridobljenih od rojenih govorcev srbščine.</p>
            </div>
        </back>

    </text>
</TEI>
