2 Module : Gargantext.Core.Text.Corpus.Parsers.FrameWrite
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Text.Corpus.Parsers.FrameWrite
15 import Control.Applicative ((*>))
16 import Control.Monad (void)
19 import Data.Text hiding (foldl)
20 import Gargantext.Core.Text (sentences)
21 import Gargantext.Prelude
22 import Prelude ((++), read)
23 import Text.Parsec hiding (Line)
24 import Text.Parsec.String
25 import qualified Data.Text as DT
26 import qualified Data.List as List
29 -- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
31 -- Authors : default : anonymous ; except if the following line is encountered
32 -- ^authors: FirstName1, LastName1 ; FirstName2, LastName2 ; etc.
33 -- date : default : date of last change except if the following line is encountered ^@@date: 2021-09-10
34 -- source: Name of the root node except if the following line is encountered ^@@source:
-- By default, 1 framawrite node = 1 document. Option for further development: allow a split level to be given at generation time, so that a framawrite node can be split into several documents:
-- by default: one document == 1 NodeWrite
-- ## means each ## section will be a new document titled with the subsubsection title. Either it features its own options for author, date, etc., or it inherits the document's options.
46 -- , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
48 , "source: someSource"
49 , "document contents 1"
50 , "document contents 2"
53 sampleUnordered :: Text
59 , "document contents 1"
61 , "authors: FirstName1, LastName1; FirstName2, LastName2"
62 , "source: someSource"
63 , "document contents 2"
66 -- parseSample = parse documentP "sample" (unpack sample)
67 -- parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
-- | Result of running 'parseLines' on the 'sample' document.
-- Handy for trying the parser out by hand.
parseLinesSample :: Either ParseError Parsed
parseLinesSample = parseLines sample
-- | Result of running 'parseLines' on the 'sampleUnordered' document.
-- Handy for trying the parser out by hand.
parseLinesSampleUnordered :: Either ParseError Parsed
parseLinesSampleUnordered = parseLines sampleUnordered
74 Author { firstName :: Text
79 Parsed { title :: Text
82 , source :: Maybe Text
95 Date { year :: Integer
-- | Parse a whole document and fold its lines into a single 'Parsed' record.
--
-- The text is parsed with 'documentLinesP'. On success the resulting lines
-- are folded left-to-right with @f@, starting from 'emptyParsed'. A parse
-- failure is passed through unchanged as 'Left'.
108 parseLines :: Text -> Either ParseError Parsed
109 parseLines text = foldl f emptyParsed <$> lst
-- Run the document parser on the unpacked text, with an empty source name.
111 lst = parse documentLinesP "" (unpack text)
-- Each equation of @f@ rebuilds the accumulator with exactly one field changed.
-- An authors line replaces the authors collected so far.
112 f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. }
-- A contents line is appended to the contents seen so far.
-- NOTE(review): nothing is inserted between the two pieces, so successive
-- content lines appear to be joined without a separator; confirm this is intended.
113 f (Parsed { .. }) (LContents c) = Parsed { contents = concat [contents, c], .. }
-- A date line sets the date; a later date line overrides an earlier one.
114 f (Parsed { .. }) (LDate d ) = Parsed { date = Just d, .. }
-- A source line sets the source; a later source line overrides an earlier one.
115 f (Parsed { .. }) (LSource s ) = Parsed { source = Just s, .. }
-- A title line replaces the title.
116 f (Parsed { .. }) (LTitle t ) = Parsed { title = t, .. }
-- | Parser for a whole document, returned as a list of 'Line's.
--
-- The result is an 'LTitle' line built from @t@, followed by the lines
-- parsed by 'lineP'.
-- NOTE(review): the statement that binds @t@ is not visible in this excerpt;
-- presumably it is the result of 'titleP' — confirm.
118 documentLinesP :: Parser [Line]
-- Body lines: zero or more 'lineP' results separated by newline characters.
121 ls <- lineP `sepBy` newline
-- The title is packed into Text and placed first in the list.
122 pure $ [LTitle $ pack t] ++ ls
126 choice [ try authorsLineP
131 authorsLineP :: Parser Line
134 pure $ LAuthors authors
136 dateLineP :: Parser Line
141 sourceLineP :: Parser Line
144 pure $ LSource $ pack source
-- | Parser for a plain content line.
--
-- Takes every character up to (but not including) the next @'\n'@ and wraps
-- it, packed, in 'LContents'. Because 'many' accepts zero characters, an
-- empty line yields empty contents rather than a failure.
146 contentsLineP :: Parser Line
148 contents <- many (noneOf "\n")
149 pure $ LContents $ pack contents
155 -- a <- optionMaybe authorsP
156 -- d <- optionMaybe dateP
157 -- s <- optionMaybe sourceP
159 -- pure $ Parsed { title = pack t
160 -- , authors = fromMaybe [] a
161 -- , date = pack <$> d
162 -- , source = pack <$> s
163 -- , contents = pack c }
165 titleDelimiterP :: Parser ()
168 -- _ <- try (string "==")
-- | Parser for the document title.
--
-- Collects characters one at a time until 'titleDelimiterP' succeeds. The
-- delimiter attempt is wrapped in 'try' so that a failed attempt consumes no
-- input and the scan can continue with the next character.
titleP :: Parser [Char]
titleP = anyChar `manyTill` try titleDelimiterP
173 authorsPrefixP :: Parser [Char]
175 _ <- string "authors:"
-- | Parser for an authors declaration.
--
-- Expects the authors prefix first (wrapped in 'try' so a failed attempt
-- consumes no input), then zero or more authors separated by @';'@.
authorsP :: Parser [Author]
authorsP = do
  _ <- try authorsPrefixP
  authorP `sepBy` char ';'
-- | Parser for a single author written as @FirstName, LastName@.
179 authorP :: Parser Author
-- First name: everything up to the first ','; the comma itself is consumed
-- and dropped.
181 fn <- manyTill anyChar (char ',')
183 --ln <- manyTill anyChar (void (char ';') <|> tokenEnd)
184 --ln <- manyTill anyChar (tokenEnd)
-- Last name: the rest of the current line.
-- NOTE(review): noneOf "\n" also accepts ';', so when 'authorsP' separates
-- authors with ';' the first author's last name appears to swallow every
-- following author. Probably this should stop at ';' as well (e.g.
-- noneOf ";\n") — confirm against the lines not visible in this excerpt.
185 ln <- many (noneOf "\n")
186 pure $ Author { firstName = pack fn, lastName = pack ln }
187 -- manyTill anyChar (void (char '\n') <|> eof)
189 datePrefixP :: Parser [Char]
194 dateP = try datePrefixP
196 -- *> many (noneOf "\n")
198 dateISOP :: Parser Date
200 year <- rd <$> number
202 month <- rd <$> number
205 _ <- many (noneOf "\n" )
206 pure $ Date { year, month, day }
208 rd = read :: [Char] -> Integer
211 sourcePrefixP :: Parser [Char]
213 _ <- string "source:"
-- | Parser for a source declaration.
--
-- Expects the source prefix (wrapped in 'try' so a failed attempt consumes
-- no input) and returns the remaining characters of the line, up to but not
-- including the next @'\n'@.
sourceP :: Parser [Char]
sourceP = do
  _ <- try sourcePrefixP
  many (noneOf "\n")
219 -- contentsP :: Parser String
220 -- contentsP = many anyChar
-- | Parser that succeeds at the end of a token: either a single newline
-- character (which is consumed) or the end of the input.
tokenEnd :: Parser ()
tokenEnd = lineBreak <|> eof
  where
    -- A single '\n', with the parsed character discarded.
    lineBreak = void (char '\n')
-- | Cut a text into paragraphs built from groups of sentences.
--
-- The pipeline, read bottom-up:
--
--   1. the input lines are joined back into one text, separated by spaces;
--   2. 'sentences' segments that text;
--   3. each sentence goes through 'clean';
--   4. 'splitEvery' groups the sentences using @n@ (presumably chunks of
--      @n@ sentences — see its definition);
--   5. each group is joined into one paragraph, sentences separated by a space.
--
-- Lines and sentences used to be joined with 'DT.concat', i.e. with no
-- separator at all. That glued the last word of a line to the first word of
-- the next one, and glued sentences together, because 'clean' strips
-- surrounding whitespace. 'DT.unwords' keeps the words apart.
text2paragraphs :: Int -> Text -> [Text]
text2paragraphs n = List.map DT.unwords
                  . splitEvery n
                  . List.map clean
                  . sentences
                  . DT.unwords
                  . DT.lines
-- | Normalise whitespace and drop over-long tokens.
--
-- The text is split on whitespace, only words shorter than 25 characters
-- are kept, and the survivors are joined back with single spaces.
clean :: Text -> Text
clean txt = DT.unwords [ w | w <- DT.words txt, DT.length w < 25 ]