<?xml version="1.0" encoding="UTF-8"?>
<alvisnlp-plan id="projectors">
<!--
///////////////////////////////////////////////////////////////////////////////////////
//
//  I. Baseline approach
//
///////////////////////////////////////////////////////////////////////////////////////
-->
<baseline>
	<fcu>
		<!-- Project concepts on text -->
  		<project class="RDFProjector">
			<!-- Input: the FCU thesaurus, matched against the "lemma" feature of annotations in the "words" layer -->
			<source>resources/thesaurus/fcu/frenchCropUsage_20210817.rdf</source>
			<subject layer="words" feature="lemma"/>

			<!-- Resource selection: owl:NamedIndividual resources, French labels; the resource IRI goes to the "uri" feature -->
			<resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
			<language>fr</language>
			<uriFeatureName>uri</uriFeatureName>

			<!-- Matching options -->
			<allowJoined>true</allowJoined>
			<joinDash>true</joinDash>
			<caseInsensitive>true</caseInsensitive>

			<!-- Output: annotations go to the "fcu-baseline" layer and carry the constant feature type=RDFProjector -->
			<targetLayerName>fcu-baseline</targetLayerName>
			<constantAnnotationFeatures>type=RDFProjector</constantAnnotationFeatures>
		</project>
		<filter>
			<!-- Step 1: take a fixed list of ambiguous lemmas out of the "fcu-baseline" layer -->
			<ambiguous-words class="Action">
				<target>documents.sections.layer:fcu-baseline[
					@lemma == "orange"
					or @lemma== "marron"
					or @lemma == "fruit"
					or @lemma == "semence"
					or @lemma == "côte"
					or @lemma == "soleil"
					or @lemma == "gel"
					or @lemma == "fleur"]</target>
				<action>remove:fcu-baseline</action>
				<removeFromLayer/>
			</ambiguous-words>

			<!-- Step 2: resolve equal, included and overlapping annotations left in "fcu-baseline" -->
			<overlaps class="RemoveOverlaps">
				<layerName>fcu-baseline</layerName>
				<removeEqual/>
				<removeIncluded>true</removeIncluded>
				<removeOverlapping>true</removeOverlapping>
			</overlaps>
		</filter>
		<export>
			<!-- Dump the annotations of the "fcu-baseline" layer through the shared export plan -->
			<annotations href="modules/export.plan">
				<!-- focus on the annotations made by the RDF projector -->
				<outLayerName>documents.sections.(layer:fcu-baseline)</outLayerName>
				<!-- save to a tabular file -->
				<outFile>fcu-baseline-annotations.csv</outFile>
			</annotations>
			<most-prominent-words>
				<!-- calculate the tf-idf score with standard parameters -->
				<tfidf-score class="KeywordsSelector">
					<!-- input terms: annotations of "fcu-baseline", identified by their skos-prefLabel feature -->
					<keywords>sections.layer:fcu-baseline</keywords>
					<keywordForm>@skos-prefLabel</keywordForm>
					<!-- parameters -->
					<scoreFunction>tfidf</scoreFunction>
					<scoreThreshold>-1000</scoreThreshold>
					<!-- save -->
					<outFile>output/annotations/fcu-baseline-tfidf.csv</outFile>
				</tfidf-score>
				<!-- calculate the bm25 score with standard parameters -->
				<bm25-score class="KeywordsSelector">
					<!-- input terms: annotations of "fcu-baseline", identified by their skos-prefLabel feature -->
					<keywords>sections.layer:fcu-baseline</keywords>
					<keywordForm>@skos-prefLabel</keywordForm>
					<documentId>document.@id</documentId>
					<!-- parameters -->
					<scoreFunction type="bm25" k1="1.2" b="0.75"/>
					<scoreThreshold>-1000</scoreThreshold>
					<!-- save -->
					<outFile>output/annotations/fcu-baseline-bm25.csv</outFile>
				</bm25-score>
			</most-prominent-words>
		</export>
	</fcu>
	<ppdo>
		<!-- Project concepts on text -->
		<project class="RDFProjector">
			<!-- project PPDO concepts on the lemmas from the corpus -->
			<source>resources/thesaurus/ppdo/ppdo_20210726.rdf</source>
			<subject layer="words" feature="lemma"/>
			<!-- keep only owl:NamedIndividual resources, with their IRI and French labels -->
			<resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
			<language>fr</language>
			<uriFeatureName>uri</uriFeatureName>
			<!-- parameters of mapping -->
			<allowJoined>true</allowJoined>
			<joinDash>true</joinDash>
			<caseInsensitive>true</caseInsensitive>
			<!-- place annotations in a layer -->
			<targetLayerName>ppdo-baseline</targetLayerName>
			<constantAnnotationFeatures>type=RDFProjector</constantAnnotationFeatures>
		</project>
		<find-patterns>
			<bbch>
				<patterns href="modules/patterns/bbch.plan"/>
				<export href="modules/export.plan">
					<!-- focus on the matches made with patterns -->
					<outLayerName>documents.sections.(layer:bbch)</outLayerName>
					<!-- save to a tabular file -->
					<outFile>ppdo-baseline-bbch.csv</outFile>
				</export>
				<align class="RDFProjector">
					<!-- create a correspondence between pattern matches ("canonical-form" of the "bbch" layer) and concepts from PPDO -->
					<source>resources/thesaurus/ppdo/ppdo_20210726.rdf</source>
					<subject layer="bbch" feature="canonical-form"/>
					<targetLayerName>ppdo-baseline</targetLayerName>
					<!-- information that has to be saved and added to the baseline layer -->
					<resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
					<language>fr</language>
					<uriFeatureName>uri</uriFeatureName>
					<!-- parameters -->
					<wordStartCaseInsensitive/>
					<allowJoined/>
					<constantAnnotationFeatures>type=PATTERN_BBCH </constantAnnotationFeatures>
				</align>
</bbch>
			<baggiolini>
				<patterns href="modules/patterns/baggiolini.plan"/>
				<export href="modules/export.plan">
					<!-- focus on the matches made with patterns -->
					<outLayerName>documents.sections.(layer:baggiolini)</outLayerName>
					<!-- save to a tabular file -->
					<outFile>ppdo-baseline-baggiolini.csv</outFile>
				</export>
				<align class="RDFProjector">
					<!-- create a correspondence between pattern matches ("canonical-form" of the "baggiolini" layer) and concepts from PPDO -->
					<source>resources/thesaurus/ppdo/ppdo_20210726.rdf</source>
					<subject layer="baggiolini" feature="canonical-form"/>
					<targetLayerName>ppdo-baseline</targetLayerName>
					<!-- information that has to be saved and added to the baseline layer -->
					<resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
					<language>fr</language>
					<uriFeatureName>uri</uriFeatureName>
					<!-- parameters -->
					<constantAnnotationFeatures>type=PATTERN_BAGGIOLINI </constantAnnotationFeatures>
				</align>
			</baggiolini>
			<eicchorn-lorenz>
				<patterns href="modules/patterns/eicchorn-lorenz.plan"/>
				<export href="modules/export.plan">
					<!-- save to a tabular file -->
					<outFile>ppdo-baseline-eicchorn-lorenz.csv</outFile>
					<!-- focus on the matches made with patterns -->
					<outLayerName>documents.sections.(layer:eicchorn-lorenz)</outLayerName>
				</export>
				<align class="RDFProjector">
					<!-- create a correspondence between pattern matches ("canonical-form" of the "eicchorn-lorenz" layer) and concepts from PPDO -->
					<source>resources/thesaurus/ppdo/ppdo_20210726.rdf</source>
					<subject layer="eicchorn-lorenz" feature="canonical-form"/>
					<targetLayerName>ppdo-baseline</targetLayerName>
					<!-- information that has to be saved and added to the baseline layer -->
					<resourceTypeURIs>owl:NamedIndividual</resourceTypeURIs>
					<language>fr</language>
					<uriFeatureName>uri</uriFeatureName>
					<!-- parameters -->
					<constantAnnotationFeatures>type=PATTERN_EICCHORN_LORENZ </constantAnnotationFeatures>
				</align>
			</eicchorn-lorenz>
		</find-patterns>
		<!-- filter overlapping annotations from the baseline layer; removeIncluded is false here, unlike in the fcu baseline filter -->
		<filter>
			<overlaps class="RemoveOverlaps">
				<layerName>ppdo-baseline</layerName>
				<removeEqual/>
				<removeIncluded>false</removeIncluded>
				<removeOverlapping>true</removeOverlapping>
			</overlaps>
		</filter>
		<!-- save ppdo stages to a tabular file -->
		<export href="modules/export.plan">
			<outLayerName>documents.sections.(layer:ppdo-baseline)</outLayerName>
			<outFile>ppdo-baseline-annotations.csv</outFile>
		</export>
	</ppdo>
	<others>
		<wine-bioagressors class="TabularProjector">
			<!-- project labels of bioagressors on the lemmas from the corpus -->
			<dictFile>resources/others/wine_bioagressors.txt</dictFile>
			<subject layer="words" feature="lemma"/>
			<!-- parameters of mapping -->
<valueFeatures>$</valueFeatures>
			<allowJoined/>
			<caseInsensitive/>
			<ignoreDiacritics/>
			<wordStartCaseInsensitive/>
			<skipWhitespace/>
			<!-- place annotations in a layer -->
			<targetLayerName>bioagressors</targetLayerName>
			<constantAnnotationFeatures>type=BIOAGRESSOR</constantAnnotationFeatures>
		</wine-bioagressors>
		<!-- All projectors below write to the same "locality" layer; the constant "type" feature tells them apart -->
		<locality>
			<regions>
				<new class="TabularProjector">
					<!-- project labels of the new French regions on the lemmas from the corpus -->
					<dictFile>resources/others/regions.txt</dictFile>
					<subject layer="words" feature="lemma"/>
					<!-- parameters of mapping -->
					<valueFeatures>$</valueFeatures>
					<allowJoined/>
					<caseInsensitive/>
					<ignoreDiacritics/>
					<skipWhitespace/>
					<!-- place annotations in a layer -->
					<targetLayerName>locality</targetLayerName>
					<constantAnnotationFeatures>type=NEW REGION</constantAnnotationFeatures>
				</new>
				<old class="TabularProjector">
					<!-- project labels of the old French regions on the lemmas from the corpus -->
					<!-- NOTE(review): this reads the same dictionary file as <new>; confirm whether a separate file of old regions was intended -->
					<dictFile>resources/others/regions.txt</dictFile>
					<subject layer="words" feature="lemma"/>
					<!-- parameters of mapping -->
					<valueFeatures>$</valueFeatures>
					<allowJoined/>
					<caseInsensitive/>
					<ignoreDiacritics/>
					<skipWhitespace/>
					<!-- place annotations in a layer; tagged OLD REGION so they can be told apart from those of <new> -->
					<targetLayerName>locality</targetLayerName>
					<constantAnnotationFeatures>type=OLD REGION</constantAnnotationFeatures>
				</old>
			</regions>
			<departments class="TabularProjector">
				<!-- project labels of French departments on the lemmas from the corpus -->
				<dictFile>resources/others/departements.txt</dictFile>
				<subject layer="words" feature="lemma"/>
				<!-- parameters of mapping -->
				<valueFeatures>$</valueFeatures>
				<allowJoined/>
				<caseInsensitive/>
				<ignoreDiacritics/>
				<skipWhitespace/>
				<!-- place annotations in a layer -->
				<targetLayerName>locality</targetLayerName>
				<constantAnnotationFeatures>type=DEPARTMENT</constantAnnotationFeatures>
			</departments>
			<communes class="TabularProjector">
				<!-- project labels of French communes on the lemmas from the corpus -->
				<dictFile>resources/others/communes.txt</dictFile>
				<subject layer="words" feature="lemma"/>
				<!-- parameters of mapping (no caseInsensitive flag here, unlike the other locality projectors) -->
				<valueFeatures>$</valueFeatures>
				<allowJoined/>
				<ignoreDiacritics/>
				<skipWhitespace/>
				<!-- place annotations in a layer -->
<targetLayerName>locality</targetLayerName>
				<constantAnnotationFeatures>type=COMMUNE</constantAnnotationFeatures>
			</communes>
		</locality>
	</others>
</baseline>
<!--
///////////////////////////////////////////////////////////////////////////////////////
//
//  II. ToMap approach
//
///////////////////////////////////////////////////////////////////////////////////////
-->
<to-map>
	<config-tomap href="modules/config-tomap.plan"/>
	<fcu>
		<project>
			<!-- find tomap candidates -->
			<classify class="TomapProjector">
				<!-- classify lemmas of candidates similar to those from the layer words (also lemmas) -->
				<subject layer="words" feature="lemma"/>
				<!-- NOTE(review): the baseline projection reads the 20210817 thesaurus; confirm that the 20210525 resources used here are intended -->
				<tomapClassifier graylist="resources/tomap/tomap-graylist.txt">resources/tomap/frenchCropUsage_20210525.tomap</tomapClassifier>
				<lemmaKeys/>
				<!-- yatea file of the candidates; the similarity score goes to the "similarity" feature -->
				<yateaFile output-feed="true">../yatea.xml</yateaFile>
				<scoreFeature>similarity</scoreFeature>
				<!-- save to a layer with some additional information -->
				<conceptFeature>IRI</conceptFeature>
				<explanationFeaturePrefix>tomap-</explanationFeaturePrefix>
				<targetLayerName>fcu-tomap</targetLayerName>
			</classify>
			<!-- map preflabels to labels of candidates -->
			<map-label class="FileMapper">
				<!-- take into account a tabular file with columns: any label, iri, preflabel -->
				<mappingFile>resources/tomap/frenchCropUsage_20210525.txt</mappingFile>
				<keyColumn>1</keyColumn>
				<targetFeatures>,,skos-prefLabel</targetFeatures>
				<target>documents.sections.layer:fcu-tomap</target>
				<form>@IRI</form>
			</map-label>
		</project>
		<filter>
			<!-- filter the same ambiguous lemmas as in the fcu baseline filter -->
			<ambiguous-words class="Action">
				<target>documents.sections.layer:fcu-tomap[
					@lemma == "orange"
					or @lemma== "marron"
					or @lemma == "fruit"
					or @lemma == "semence"
					or @lemma == "côte"
					or @lemma == "soleil"
					or @lemma == "gel"
					or @lemma == "fleur"]</target>
				<action>remove:fcu-tomap</action>
				<removeFromLayer/>
			</ambiguous-words>
			<!-- disabled step, kept for reference:
			<remove-not-similar-matches class="Action">
				<target>documents.sections.layer:fcu-tomap[0.4 > @similarity ]</target>
				<action>delete</action>
				<deleteElements>true</deleteElements>
			</remove-not-similar-matches>
			-->
			<!-- filter overlapping annotations -->
<overlaps class="RemoveOverlaps">
				<layerName>fcu-tomap</layerName>
				<removeEqual/>
				<removeIncluded>true</removeIncluded>
				<removeOverlapping>true</removeOverlapping>
			</overlaps>
		</filter>
		<export>
			<!-- dump the annotations of the "fcu-tomap" layer through the shared export plan -->
			<annotations href="modules/export.plan">
				<outFile>fcu-tomap-annotations.csv</outFile>
				<outLayerName>documents.sections.(layer:fcu-tomap)</outLayerName>
			</annotations>
			<most-prominent-words>
				<!-- calculate the tf-idf score with standard parameters -->
				<tfidf-score class="KeywordsSelector">
					<!-- input terms: annotations of "fcu-tomap", identified by their skos-prefLabel feature -->
					<keywords>sections.layer:fcu-tomap</keywords>
					<keywordForm>@skos-prefLabel</keywordForm>
					<!-- parameters -->
					<scoreFunction>tfidf</scoreFunction>
					<scoreThreshold>-1000</scoreThreshold>
					<!-- save -->
					<outFile>output/annotations/fcu-tomap-tfidf.csv</outFile>
				</tfidf-score>
				<!-- calculate the bm25 score with standard parameters -->
				<bm25-score class="KeywordsSelector">
					<!-- input terms: annotations of "fcu-tomap", identified by their skos-prefLabel feature -->
					<keywords>sections.layer:fcu-tomap</keywords>
					<keywordForm>@skos-prefLabel</keywordForm>
					<documentId>document.@id</documentId>
					<!-- parameters -->
					<scoreFunction type="bm25" k1="1.2" b="0.75"/>
					<scoreThreshold>-1000</scoreThreshold>
					<!-- save -->
					<outFile>output/annotations/fcu-tomap-bm25.csv</outFile>
				</bm25-score>
			</most-prominent-words>
		</export>
	</fcu>
	<ppdo>
		<project>
			<!-- find tomap candidates -->
			<classify class="TomapProjector">
				<!-- classify lemmas of candidates similar to those from the layer words (also lemmas) -->
				<subject layer="words" feature="lemma"/>
				<tomapClassifier graylist="resources/tomap/tomap-graylist.txt">resources/tomap/ppdo_20210726.tomap</tomapClassifier>
				<lemmaKeys/>
				<!-- yatea file of the candidates; the similarity score goes to the "similarity" feature -->
				<yateaFile output-feed="true">../yatea.xml</yateaFile>
				<scoreFeature>similarity</scoreFeature>
				<!-- save to a layer with some additional information -->
				<conceptFeature>IRI</conceptFeature>
				<explanationFeaturePrefix>tomap-</explanationFeaturePrefix>
				<targetLayerName>ppdo-tomap</targetLayerName>
			</classify>
			<!-- map preflabels to labels of candidates -->
			<map-label class="FileMapper">
				<!-- take into account a tabular file with columns: any label, iri, preflabel -->
				<mappingFile>resources/tomap/ppdo_20210726.txt</mappingFile>
				<keyColumn>1</keyColumn>
				<targetFeatures>,,skos-prefLabel</targetFeatures>
				<target>documents.sections.layer:ppdo-tomap</target>
				<form>@IRI</form>
			</map-label>
		</project>
<filter>
			<!-- resolve equal, included and overlapping annotations in the "ppdo-tomap" layer -->
			<overlaps class="RemoveOverlaps">
				<layerName>ppdo-tomap</layerName>
				<removeEqual/>
				<removeIncluded>true</removeIncluded>
				<removeOverlapping>true</removeOverlapping>
			</overlaps>
		</filter>
		<export>
			<!-- dump the annotations of the "ppdo-tomap" layer through the shared export plan -->
			<annotations href="modules/export.plan">
				<outFile>ppdo-tomap-annotations.csv</outFile>
				<outLayerName>documents.sections.(layer:ppdo-tomap)</outLayerName>
			</annotations>
		</export>
	</ppdo>
</to-map>
</alvisnlp-plan>