4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE NoImplicitPrelude #-}
14 module Gargantext.RCT where
16 import Gargantext.Prelude
20 --import Data.Text (Text, words)
21 --import Data.Attoparsec.Text (anyChar, isEndOfLine, Parser, takeTill, many1, endOfLine, space, manyTill)
22 --import Control.Applicative (many)
24 -- RCT is the acronym for Referential ConText (of Text)
25 -- in the beginning there was a byte
29 -- then a list of chars, called a string; we call it a Form
30 -- (after removing all weird characters which are not alphanumeric)
32 -- Form -> RCT Sentence
34 -- These forms compose the RCT Sentence
35 -- an ngram is composed of multiple forms
37 -- Paragraph = [Sentence]
39 -- type Title = Paragraph
40 -- data Block = [Paragraph]
41 -- Block is taken from Pandoc
43 -- data Document = [Block]
49 -- Paragraph (abstract + title)
50 -- Sentence - Ngrams - Forms
54 --separateurs :: Parser Text
55 --separateurs = dropWhile isEndOfLine
57 --paragraphs :: Parser [Text]
58 --paragraphs = many paragraph
60 --paragraph :: Parser Text
61 --paragraph = takeTill isEndOfLine <* many1 endOfLine
63 -- forms :: Text -> [Text]