Commit 3f60b535 authored by rdecoupe's avatar rdecoupe
Browse files

add biotex properties generation

parent 4c392231
# Wrap BioTex in a REST API (Python)
This project aims to deploy a REST API around BioTex. **BioTex is an Automated Term Extractor** (see [here](http://tubo.lirmm.fr/biotex/index.jsp) for more details).
BioTex needs a POS (Part of Speech) tagger. Its author suggests [TreeTagger](https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/).
This repository is based upon the work of **Jacques Fize**, who built a Python wrapper for BioTex (see [his repository](https://gitlab.irstea.fr/jacques.fize/biotex_python) for more details).
\ No newline at end of file
......@@ -24,31 +24,61 @@
name:
- openjdk-8-jdk
- name: Prepare Download directory
become: yes
# Create the directory that receives every TreeTagger download below.
# Only one `path` is kept: duplicate mapping keys are invalid YAML, and the
# later tasks all work inside ~/TreeTagger.
- name: "POS : Prepare Download directory"
  file:
    state: directory
    path: "~/TreeTagger"
# Fetch the TreeTagger Linux binary archive into the TreeTagger directory.
# A single `dest` is kept (duplicate mapping keys are invalid YAML).
- name: "POS : Download TreeTagger"
  get_url:
    url: "https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.2.tar.gz"
    dest: "~/TreeTagger"
# Fetch the tagger-scripts archive into the TreeTagger directory.
# A single `dest` is kept (duplicate mapping keys are invalid YAML).
- name: "POS : DownloadTreeTagger tagging scripts"
  get_url:
    url: "https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz"
    dest: "~/TreeTagger"
# Fetch install-tagger.sh next to the archives in the TreeTagger directory.
# A single `dest` is kept (duplicate mapping keys are invalid YAML).
- name: "POS : Download TreeTagger install script"
  get_url:
    url: "https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/install-tagger.sh"
    dest: "~/TreeTagger"
# Fetch the French parameter file into the TreeTagger directory.
# A single `dest` is kept (duplicate mapping keys are invalid YAML).
- name: "POS : Download French package"
  get_url:
    url: "https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/french.par.gz"
    dest: "~/TreeTagger"
# Run the TreeTagger install script from the directory the archives and the
# script were downloaded into.
# A task may carry only one action module, so a single `command` is used; the
# working directory is set explicitly with `chdir` instead of relying on an
# implicit `cd` from wherever the shell happens to start.
- name: "POS : install TreeTagger"
  command: "sh install-tagger.sh"
  args:
    chdir: "~/TreeTagger"
# Make sure the checkout directory for the BioTex sources exists.
- name: "biotex : Prepare Download directory"
  file:
    path: "~/biotex"
    state: directory
# Check out the BioTex sources into ~/biotex.
- name: "git clone biotex"
  git:
    repo: https://github.com/sifrproject/biotex.git
    dest: "~/biotex"
    # The `maven` branch is the one used for the build.
    version: maven
    # Otherwise Ansible errors out when the repository already exists locally.
    force: true
# Build BioTex with Maven inside the checkout created by the git task.
# `command` with an explicit `chdir` replaces `shell: cd ... && ...`, so the
# task no longer depends on the implicit starting directory and does not need
# a shell at all.
- name: "Maven build"
  command: "mvn package"
  args:
    chdir: "~/biotex"
# Directory holding the input corpus (biotex.input_file points inside it).
- name: "biotex : corpus directory"
  file:
    path: "~/corpus"
    state: directory
# Directory that biotex.output points to.
- name: "biotex : output directory"
  file:
    path: "~/output"
    state: directory
# Render the BioTex configuration from its Jinja2 template into the checkout.
- name: "biotex: biotex.properties"
  template:
    dest: "~/biotex/biotex.properties"
    src: templates/biotex.properties.j2
# BioTex configuration, rendered by Ansible; every path is rooted at the
# home directory taken from ansible_env.HOME.
#
# Source directories: patterns, reference data set, stop words, TreeTagger.
biotex.source_patterns = {{ ansible_env.HOME }}/biotex/patterns
biotex.source_dataset_reference = {{ ansible_env.HOME }}/biotex/dataSetReference
biotex.source_stop_words = {{ ansible_env.HOME }}/biotex/stopWords
biotex.source_treetagger = {{ ansible_env.HOME }}/TreeTagger
# Output location and input corpus file.
biotex.output={{ ansible_env.HOME }}/output
biotex.input_file={{ ansible_env.HOME }}/corpus/corpus.txt
# Type of terms: all | multi
biotex.type_of_terms=all
# Language: french, english or spanish
biotex.language=french
biotex.min_term_frequency=1
# Number of patterns. Default: 200
biotex.number_of_patterns=200
# Measure to use.
# For one document: LValue CValue
# For a set of documents: LIDFValue F-OCapi_A F-OCapi_M F-OCapi_S F-TFIDF-C_A F-TFIDF-C_M F-TFIDF-C_S
#   TFIDF_A TFIDF_M TFIDF_S Okapi_A Okapi_M Okapi_S
biotex.measure=Okapi_A
# Source type:
#   1 = single file (only for LValue or CValue)
#   2 = set of files (for LIDF-value or any measure)
biotex.source_type=2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment