<?xml version="1.0" encoding="UTF-8"?>
<!-- Last change e75e997c, authored by Anna CHEPAIKINA: Update modules/config-tomap.plan, modules/preprocess.plan, train-tomap.plan, modules/patterns/find-themes.plan, modules/patterns/eicchorn-lorenz.plan, modules/patterns/bbch.plan, modules/patterns/baggiolini.plan, modules/projectors.plan, modules/export.plan -->
<alvisnlp-plan id="projectors">
<!--
///////////////////////////////////////////////////////////////////////////////////////
//
// I. Baseline approach
//
///////////////////////////////////////////////////////////////////////////////////////
-->
<baseline>
<fcu>
<!-- Annotate the text with concepts of the FCU thesaurus -->
<project class="RDFProjector">
  <!-- thesaurus file; its labels are looked up in the lemma feature of the words layer -->
  <source>resources/thesaurus/fcu/frenchCropUsage_20210817.rdf</source>
  <subject layer="words" feature="lemma"/>
  <!-- keep owl:NamedIndividual resources only, with French labels; the IRI goes to the uri feature -->
  <resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
  <language>fr</language>
  <uriFeatureName>uri</uriFeatureName>
  <!-- matching options -->
  <allowJoined>true</allowJoined>
  <joinDash>true</joinDash>
  <caseInsensitive>true</caseInsensitive>
  <!-- matches are written to the fcu-baseline layer and tagged with their origin -->
  <targetLayerName>fcu-baseline</targetLayerName>
  <constantAnnotationFeatures>type=RDFProjector</constantAnnotationFeatures>
</project>
<filter>
  <!-- take annotations out of fcu-baseline when their lemma is one of these ambiguous words -->
  <ambiguous-words class="Action">
    <target>documents.sections.layer:fcu-baseline[
      @lemma == "orange"
      or @lemma== "marron"
      or @lemma == "fruit"
      or @lemma == "semence"
      or @lemma == "côte"
      or @lemma == "soleil"
      or @lemma == "gel"
      or @lemma == "fleur"]</target>
    <action>remove:fcu-baseline</action>
    <removeFromLayer/>
  </ambiguous-words>
  <!-- resolve equal, included and overlapping annotations left in fcu-baseline -->
  <overlaps class="RemoveOverlaps">
    <layerName>fcu-baseline</layerName>
    <removeEqual/>
    <removeIncluded>true</removeIncluded>
    <removeOverlapping>true</removeOverlapping>
  </overlaps>
</filter>
<export>
  <!-- tabular dump of the fcu-baseline annotations, delegated to modules/export.plan -->
  <annotations href="modules/export.plan">
    <!-- layer holding the RDFProjector matches -->
    <outLayerName>documents.sections.(layer:fcu-baseline)</outLayerName>
    <!-- name of the resulting table -->
    <outFile>fcu-baseline-annotations.csv</outFile>
  </annotations>
  <most-prominent-words>
    <!-- keyword ranking with the tf-idf score -->
    <tfidf-score class="KeywordsSelector">
      <!-- keywords are the fcu-baseline annotations, identified by their preferred label -->
      <keywords>sections.layer:fcu-baseline</keywords>
      <keywordForm>@skos-prefLabel</keywordForm>
      <!-- scoring; NOTE(review): a threshold of -1000 presumably keeps every keyword, confirm -->
      <scoreFunction>tfidf</scoreFunction>
      <scoreThreshold>-1000</scoreThreshold>
      <!-- result file -->
      <outFile>output/annotations/fcu-baseline-tfidf.csv</outFile>
    </tfidf-score>
    <!-- keyword ranking with the bm25 score (k1=1.2, b=0.75) -->
    <bm25-score class="KeywordsSelector">
      <!-- same keywords as above; documents are identified through document.@id -->
      <keywords>sections.layer:fcu-baseline</keywords>
      <keywordForm>@skos-prefLabel</keywordForm>
      <documentId>document.@id</documentId>
      <!-- scoring -->
      <scoreFunction type="bm25" k1="1.2" b="0.75"/>
      <scoreThreshold>-1000</scoreThreshold>
      <!-- result file -->
      <outFile>output/annotations/fcu-baseline-bm25.csv</outFile>
    </bm25-score>
  </most-prominent-words>
</export>
</fcu>
<ppdo>
<!-- Annotate the text with concepts of the PPDO thesaurus -->
<project class="RDFProjector">
  <!-- thesaurus file; its labels are looked up in the lemma feature of the words layer -->
  <source>resources/thesaurus/ppdo/ppdo_20210726.rdf</source>
  <subject layer="words" feature="lemma"/>
  <!-- keep owl:NamedIndividual resources only, with French labels; the IRI goes to the uri feature -->
  <resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
  <language>fr</language>
  <uriFeatureName>uri</uriFeatureName>
  <!-- matching options -->
  <allowJoined>true</allowJoined>
  <joinDash>true</joinDash>
  <caseInsensitive>true</caseInsensitive>
  <!-- matches are written to the ppdo-baseline layer and tagged with their origin -->
  <targetLayerName>ppdo-baseline</targetLayerName>
  <constantAnnotationFeatures>type=RDFProjector</constantAnnotationFeatures>
</project>
<find-patterns>
<bbch>
  <!-- pattern matching is defined in its own plan -->
  <patterns href="modules/patterns/bbch.plan"/>
  <export href="modules/export.plan">
    <!-- layer holding the pattern matches -->
    <outLayerName>documents.sections.(layer:bbch)</outLayerName>
    <!-- name of the resulting table -->
    <outFile>ppdo-baseline-bbch.csv</outFile>
  </export>
  <!-- align the canonical form of each bbch match with a PPDO concept -->
  <align class="RDFProjector">
    <source>resources/thesaurus/ppdo/ppdo_20210726.rdf</source>
    <subject layer="bbch" feature="canonical-form"/>
    <!-- aligned concepts join the RDFProjector matches in ppdo-baseline -->
    <targetLayerName>ppdo-baseline</targetLayerName>
    <!-- keep owl:NamedIndividual resources only, with French labels; the IRI goes to the uri feature -->
    <resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
    <language>fr</language>
    <uriFeatureName>uri</uriFeatureName>
    <!-- matching options -->
    <wordStartCaseInsensitive/>
    <allowJoined/>
    <!-- value kept on one line so that no line break ends up in the feature value -->
    <constantAnnotationFeatures>type=PATTERN_BBCH</constantAnnotationFeatures>
  </align>
</bbch>
<baggiolini>
  <!-- pattern matching is defined in its own plan -->
  <patterns href="modules/patterns/baggiolini.plan"/>
  <export href="modules/export.plan">
    <!-- layer holding the pattern matches -->
    <outLayerName>documents.sections.(layer:baggiolini)</outLayerName>
    <!-- name of the resulting table -->
    <outFile>ppdo-baseline-baggiolini.csv</outFile>
  </export>
  <!-- align the canonical form of each baggiolini match with a PPDO concept -->
  <align class="RDFProjector">
    <source>resources/thesaurus/ppdo/ppdo_20210726.rdf</source>
    <subject layer="baggiolini" feature="canonical-form"/>
    <!-- aligned concepts join the RDFProjector matches in ppdo-baseline -->
    <targetLayerName>ppdo-baseline</targetLayerName>
    <!-- keep owl:NamedIndividual resources only, with French labels; the IRI goes to the uri feature -->
    <resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
    <language>fr</language>
    <uriFeatureName>uri</uriFeatureName>
    <!-- value kept on one line so that no line break ends up in the feature value -->
    <constantAnnotationFeatures>type=PATTERN_BAGGIOLINI</constantAnnotationFeatures>
  </align>
</baggiolini>
<eicchorn-lorenz>
  <!-- pattern matching is defined in its own plan -->
  <patterns href="modules/patterns/eicchorn-lorenz.plan"/>
  <!-- tabular dump of the pattern matches -->
  <export href="modules/export.plan">
    <outFile>ppdo-baseline-eicchorn-lorenz.csv</outFile>
    <outLayerName>documents.sections.(layer:eicchorn-lorenz)</outLayerName>
  </export>
  <!-- align the canonical form of each eicchorn-lorenz match with a PPDO concept -->
  <align class="RDFProjector">
    <source>resources/thesaurus/ppdo/ppdo_20210726.rdf</source>
    <subject layer="eicchorn-lorenz" feature="canonical-form"/>
    <!-- aligned concepts join the RDFProjector matches in ppdo-baseline -->
    <targetLayerName>ppdo-baseline</targetLayerName>
    <!-- keep owl:NamedIndividual resources only, with French labels; the IRI goes to the uri feature -->
    <resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
    <language>fr</language>
    <uriFeatureName>uri</uriFeatureName>
    <!-- value kept on one line so that no line break ends up in the feature value -->
    <constantAnnotationFeatures>type=PATTERN_EICCHORN_LORENZ</constantAnnotationFeatures>
  </align>
</eicchorn-lorenz>
</find-patterns>
<!-- Clean up overlapping annotations in the ppdo-baseline layer -->
<filter>
  <overlaps class="RemoveOverlaps">
    <layerName>ppdo-baseline</layerName>
    <!-- unlike the fcu filters of this plan, removeIncluded is switched off here -->
    <removeEqual/>
    <removeIncluded>false</removeIncluded>
    <removeOverlapping>true</removeOverlapping>
  </overlaps>
</filter>
<!-- Tabular dump of the ppdo-baseline layer, delegated to modules/export.plan -->
<export href="modules/export.plan">
  <outLayerName>documents.sections.(layer:ppdo-baseline)</outLayerName>
  <outFile>ppdo-baseline-annotations.csv</outFile>
</export>
</ppdo>
<others>
<wine-bioagressors class="TabularProjector">
  <!-- dictionary of bioaggressor labels, looked up in the lemma feature of the words layer -->
  <dictFile>resources/others/wine_bioagressors.txt</dictFile>
  <subject layer="words" feature="lemma"/>
  <!-- matching options -->
  <valueFeatures>$</valueFeatures>
  <allowJoined/>
  <caseInsensitive/>
  <ignoreDiacritics/>
  <wordStartCaseInsensitive/>
  <skipWhitespace/>
  <!-- matches are written to the bioagressors layer with a constant type -->
  <targetLayerName>bioagressors</targetLayerName>
  <constantAnnotationFeatures>type=BIOAGRESSOR</constantAnnotationFeatures>
</wine-bioagressors>
<locality>
<regions>
<new class="TabularProjector">
  <!-- dictionary of the new French regions, looked up in the lemma feature of the words layer -->
  <dictFile>resources/others/regions.txt</dictFile>
  <subject layer="words" feature="lemma"/>
  <!-- matching options -->
  <valueFeatures>$</valueFeatures>
  <allowJoined/>
  <caseInsensitive/>
  <ignoreDiacritics/>
  <skipWhitespace/>
  <!-- matches are written to the locality layer with a constant type -->
  <targetLayerName>locality</targetLayerName>
  <constantAnnotationFeatures>type=NEW REGION</constantAnnotationFeatures>
</new>
<old class="TabularProjector">
  <!-- dictionary of the old French regions, looked up in the lemma feature of the words layer -->
  <!-- NOTE(review): this is the same file as the one used by the "new" regions projector;
       confirm whether a separate dictionary of old regions was intended -->
  <dictFile>resources/others/regions.txt</dictFile>
  <subject layer="words" feature="lemma"/>
  <!-- matching options -->
  <valueFeatures>$</valueFeatures>
  <allowJoined/>
  <caseInsensitive/>
  <ignoreDiacritics/>
  <skipWhitespace/>
  <!-- matches are written to the locality layer; the type tells them apart from new regions -->
  <targetLayerName>locality</targetLayerName>
  <constantAnnotationFeatures>type=OLD REGION</constantAnnotationFeatures>
</old>
</regions>
<departments class="TabularProjector">
  <!-- dictionary of French departments, looked up in the lemma feature of the words layer -->
  <dictFile>resources/others/departements.txt</dictFile>
  <subject layer="words" feature="lemma"/>
  <!-- matching options -->
  <valueFeatures>$</valueFeatures>
  <allowJoined/>
  <caseInsensitive/>
  <ignoreDiacritics/>
  <skipWhitespace/>
  <!-- matches are written to the locality layer with a constant type -->
  <targetLayerName>locality</targetLayerName>
  <constantAnnotationFeatures>type=DEPARTMENT</constantAnnotationFeatures>
</departments>
<communes class="TabularProjector">
  <!-- dictionary of French communes, looked up in the lemma feature of the words layer -->
  <dictFile>resources/others/communes.txt</dictFile>
  <subject layer="words" feature="lemma"/>
  <!-- matching options -->
  <!-- NOTE(review): caseInsensitive is not set here although the regions and departments
       projectors set it; presumably deliberate, confirm -->
  <valueFeatures>$</valueFeatures>
  <allowJoined/>
  <ignoreDiacritics/>
  <skipWhitespace/>
  <!-- matches are written to the locality layer with a constant type -->
  <targetLayerName>locality</targetLayerName>
  <constantAnnotationFeatures>type=COMMUNE</constantAnnotationFeatures>
</communes>
</locality>
</others>
</baseline>
<!--
///////////////////////////////////////////////////////////////////////////////////////
//
// II. ToMap approach
//
///////////////////////////////////////////////////////////////////////////////////////
-->
<to-map>
<!-- ToMap configuration, taken from the plan referenced by href -->
<config-tomap href="modules/config-tomap.plan"/>
<fcu>
<project>
  <!-- look for ToMap candidates of FCU concepts -->
  <classify class="TomapProjector">
    <!-- candidates are compared with the lemma feature of the words layer -->
    <subject layer="words" feature="lemma"/>
    <tomapClassifier graylist="resources/tomap/tomap-graylist.txt" >resources/tomap/frenchCropUsage_20210525.tomap</tomapClassifier>
    <lemmaKeys/>
    <!-- YaTeA file of the candidates; the similarity score is kept in the similarity feature -->
    <!-- NOTE(review): the ppdo ToMap projector of this plan points at the same ../yatea.xml, confirm this is intended -->
    <yateaFile output-feed="true">../yatea.xml</yateaFile>
    <scoreFeature>similarity</scoreFeature>
    <!-- annotations go to fcu-tomap; the concept is stored in IRI, explanations in tomap-* features -->
    <conceptFeature>IRI</conceptFeature>
    <explanationFeaturePrefix>tomap-</explanationFeaturePrefix>
    <targetLayerName>fcu-tomap</targetLayerName>
  </classify>
  <!-- attach the preferred label to every candidate -->
  <map-label class="FileMapper">
    <!-- tabular file with the columns: any label, iri, preflabel -->
    <mappingFile>resources/tomap/frenchCropUsage_20210525.txt</mappingFile>
    <!-- the IRI feature of each annotation is the lookup form; the third column fills skos-prefLabel -->
    <keyColumn>1</keyColumn>
    <targetFeatures>,,skos-prefLabel</targetFeatures>
    <target>documents.sections.layer:fcu-tomap</target>
    <form>@IRI</form>
  </map-label>
</project>
<filter>
  <!-- take annotations out of fcu-tomap when their lemma is one of these ambiguous words -->
  <ambiguous-words class="Action">
    <target>documents.sections.layer:fcu-tomap[
      @lemma == "orange" or
      @lemma== "marron" or
      @lemma == "fruit" or
      @lemma == "semence" or
      @lemma == "côte" or
      @lemma == "soleil" or
      @lemma == "gel" or
      @lemma == "fleur"]</target>
    <action>remove:fcu-tomap</action>
    <removeFromLayer/>
  </ambiguous-words>
  <!-- disabled step: would delete the matches whose similarity is below 0.4 -->
  <!--
  <remove-not-similar-matches class="Action">
    <target>documents.sections.layer:fcu-tomap[0.4 > @similarity ]</target>
    <action>delete</action>
    <deleteElements>true</deleteElements>
  </remove-not-similar-matches>
  -->
  <!-- resolve equal, included and overlapping annotations left in fcu-tomap -->
  <overlaps class="RemoveOverlaps">
    <layerName>fcu-tomap</layerName>
    <removeEqual/>
    <removeIncluded>true</removeIncluded>
    <removeOverlapping>true</removeOverlapping>
  </overlaps>
</filter>
<export>
  <!-- tabular dump of the fcu-tomap annotations, delegated to modules/export.plan -->
  <annotations href="modules/export.plan">
    <outFile>fcu-tomap-annotations.csv</outFile>
    <outLayerName>documents.sections.(layer:fcu-tomap)</outLayerName>
  </annotations>
  <most-prominent-words>
    <!-- keyword ranking with the tf-idf score -->
    <tfidf-score class="KeywordsSelector">
      <!-- keywords are the fcu-tomap annotations, identified by their preferred label -->
      <keywords>sections.layer:fcu-tomap</keywords>
      <keywordForm>@skos-prefLabel</keywordForm>
      <!-- scoring -->
      <scoreFunction>tfidf</scoreFunction>
      <scoreThreshold>-1000</scoreThreshold>
      <!-- result file -->
      <outFile>output/annotations/fcu-tomap-tfidf.csv</outFile>
    </tfidf-score>
    <!-- keyword ranking with the bm25 score (k1=1.2, b=0.75) -->
    <bm25-score class="KeywordsSelector">
      <!-- same keywords as above; documents are identified through document.@id -->
      <keywords>sections.layer:fcu-tomap</keywords>
      <keywordForm>@skos-prefLabel</keywordForm>
      <documentId>document.@id</documentId>
      <!-- scoring -->
      <scoreFunction type="bm25" k1="1.2" b="0.75"/>
      <scoreThreshold>-1000</scoreThreshold>
      <!-- result file -->
      <outFile>output/annotations/fcu-tomap-bm25.csv</outFile>
    </bm25-score>
  </most-prominent-words>
</export>
</fcu>
<ppdo>
<project>
  <!-- look for ToMap candidates of PPDO concepts -->
  <classify class="TomapProjector">
    <!-- candidates are compared with the lemma feature of the words layer -->
    <subject layer="words" feature="lemma"/>
    <tomapClassifier graylist="resources/tomap/tomap-graylist.txt" >resources/tomap/ppdo_20210726.tomap</tomapClassifier>
    <lemmaKeys/>
    <!-- YaTeA file of the candidates; the similarity score is kept in the similarity feature -->
    <!-- NOTE(review): the fcu ToMap projector of this plan points at the same ../yatea.xml, confirm this is intended -->
    <yateaFile output-feed="true">../yatea.xml</yateaFile>
    <scoreFeature>similarity</scoreFeature>
    <!-- annotations go to ppdo-tomap; the concept is stored in IRI, explanations in tomap-* features -->
    <conceptFeature>IRI</conceptFeature>
    <explanationFeaturePrefix>tomap-</explanationFeaturePrefix>
    <targetLayerName>ppdo-tomap</targetLayerName>
  </classify>
  <!-- attach the preferred label to every candidate -->
  <map-label class="FileMapper">
    <!-- tabular file with the columns: any label, iri, preflabel -->
    <mappingFile>resources/tomap/ppdo_20210726.txt</mappingFile>
    <!-- the IRI feature of each annotation is the lookup form; the third column fills skos-prefLabel -->
    <keyColumn>1</keyColumn>
    <targetFeatures>,,skos-prefLabel</targetFeatures>
    <target>documents.sections.layer:ppdo-tomap</target>
    <form>@IRI</form>
  </map-label>
</project>
<filter>
  <!-- resolve equal, included and overlapping annotations in ppdo-tomap -->
  <overlaps class="RemoveOverlaps">
    <layerName>ppdo-tomap</layerName>
    <removeEqual/>
    <removeIncluded>true</removeIncluded>
    <removeOverlapping>true</removeOverlapping>
  </overlaps>
</filter>
<export>
  <!-- tabular dump of the ppdo-tomap annotations, delegated to modules/export.plan -->
  <annotations href="modules/export.plan">
    <outFile>ppdo-tomap-annotations.csv</outFile>
    <outLayerName>documents.sections.(layer:ppdo-tomap)</outLayerName>
  </annotations>
</export>
</ppdo>
</to-map>
</alvisnlp-plan>