108 lines
4.4 KiB
HTML
108 lines
4.4 KiB
HTML
#HTTP_HEADER{Content-Type: application/json; charset=#CHARSET}
[(#REM) JSON feed describing the corpus table. The first array element holds the column labels, the filter widget used for each column and the CSS classes applied to each column. Row numbering starts at 1. ]
[(#ENV{id}|setenv{id_jsonl})]
#SET{id_jsonl,1}
[{
	"header": {
		"id": "id jsonl",
		"site": "Site",
		"date_aspi": "Date archive",
		"num_page": "Numéro page",
		"title": "Meta title",
		"desc": "Meta description",
		"keywords": "Meta keywords",
		"trackers": "Trackers",
		"rezos": "Rézos socios",
		"titres": "titres",
		"liens_ext": "Liens externes",
		"liens_int": "Liens internes",
		"text": "Texte"
	},
	"filtreCol": {
		"site": "select",
		"date_aspi": "select",
		"num_page": "select",
		"title": "input",
		"desc": "input",
		"keywords": "input",
		"trackers": "select",
		"rezos": "select",
		"titres": "input",
		"liens_ext": "input",
		"liens_int": "input",
		"text": "input"
	},
	"classes": {
		"num_page": "w80p",
		"title": "w120p",
		"desc": "w120p",
		"keywords": "w200p",
		"trackers": "w100p",
		"rezos": "w100p",
		"titres": "minw400p",
		"liens_int": "w400p maxw400p",
		"text": "minw400p"
	}
}
|
|
[(#REM) Emits one JSON row per sampled line of each file listed from the configured corpus directory, files sorted by basename. ]
<BOUCLE_corpus(DATA){source ls, #CHEMIN{tmp}/#CONFIG{corpus_web/repertoire_corpus,corpus_jsonl}/*}{par basename}>
	[(#SET{site,#FILE|basename|recup_site})]
	[(#SET{date_aspi,#FILE|basename|recup_aspi})]
	[(#REM) Size limit is the configured taille_max multiplied by 1000000. A file whose size is not below it produces no row. ]
	#SET{trop_gros,#CONFIG{corpus_web/taille_max}|mult{1000000}}
	#SET{nb_lignes, #FILE|nombre_ligne}
	[(#REM) Walks the line numbers from 1 to nb_lignes using the configured step, 10 when unset. ]
	<BOUCLE_2lignes(DATA){enum 1, #GET{nb_lignes}, #CONFIG{corpus_web/pas_corpus, 10}}{si #SIZE|<{#GET{trop_gros}}}>
		[(#REM) Read the line whose number is the current enum value so that the row content matches its Page label. ]
		#SET{num_ligne,#VALEUR}
		[(#SET{content,[(#FILE|affiche_ligne{#GET{num_ligne}}|json_decode{true})]})]
		[(#REM) Reset every per-row variable, text included, so that nothing leaks from the previous row. ]
		#SET{num_page,Page #VALEUR}
		#SET{title,''}
		#SET{keywords,''}
		#SET{description,''}
		#SET{text,''}
		#SET{titres,''}
		#SET{trackers,''}
		#SET{rezos,''}
		#SET{liens_ext,''}
		#SET{liens_int,''}
		<BOUCLE_content(DATA){source table, #GET{content}}>
			[(#REM) Extraction fields are only read under the extractionContent key. The condition is a si criterion on the inner loop. ]
			<BOUCLE_ss(DATA){source table, #VALEUR}{si #CLE|=={extractionContent}}>
				[(#REM) Each value is escaped while keeping line breaks: br tags are swapped for a placeholder, angle brackets become entities, then the placeholder and newlines become br tags. ]
				[(#CLE|=={boilerpipe:text}|oui)
					[(#SET{text, [(#VALEUR|print|supprimer_com|replace{<br>,§§}|replace{<br />,§§}|replace{<,&lt;}|replace{>,&gt;}|replace{§§,<br>}|replace{"\n", <br>})]})]
				]
				[(#CLE|=={htmlmeta:head:title}|oui)
					[(#SET{title, [(#VALEUR|print|replace{<br>,§§}|replace{<br />,§§}|replace{<,&lt;}|replace{>,&gt;}|replace{§§,<br>}|replace{"\n", <br>})]})]
				]
				[(#CLE|=={htmlmeta:head:keywords}|oui)
					[(#SET{keywords, [(#VALEUR|print|replace{<br>,§§}|replace{<br />,§§}|replace{<,&lt;}|replace{>,&gt;}|replace{§§,<br>}|replace{"\n", <br>}|replace{',', ', '})]})]
				]
				[(#CLE|=={htmlmeta:head:description}|oui)
					[(#SET{description, [(#VALEUR|print|replace{<br>,§§}|replace{<br />,§§}|replace{<,&lt;}|replace{>,&gt;}|replace{§§,<br>}|replace{"\n", <br>})]})]
				]
			</BOUCLE_ss>
			[(#REM) The htmlBytes value is base64 decoded and handed to the project filters that extract headings, trackers, social networks and links. ]
			[(#CLE|=={htmlBytes}|oui)
				[(#SET{titres, [(#VALEUR|base64_decode|recup_titres)]})]
				[(#SET{trackers, [(#VALEUR|base64_decode|trouve_trackers)]})]
				[(#SET{rezos, [(#VALEUR|base64_decode|trouve_rezos)]})]
				[(#SET{liens_ext, [(#VALEUR|base64_decode|recup_liens_ext)]})]
				[(#SET{liens_int, [(#VALEUR|base64_decode|recup_liens_int)]})]
			]
		</BOUCLE_content>
		[(#REM) Row object: html holds the displayed cells, search holds the same values with list and br markup stripped by sans_li and sans_br. ]
		,{
			"html": {
				"id": [(#GET{id_jsonl})],
				"site": [(#GET{site}|json_encode)],
				"date_aspi": [(#GET{date_aspi}|json_encode)],
				"num_page": [(#GET{num_page}|json_encode)],
				"title": [(#GET{title}|json_encode)],
				"desc": [(#GET{description}|json_encode)],
				"keywords": [(#GET{keywords}|json_encode)],
				"trackers": [(#GET{trackers}|json_encode)],
				"rezos": [(#GET{rezos}|json_encode)],
				"titres": [(#GET{titres}|json_encode)],
				"liens_ext": [(#GET{liens_ext}|json_encode)],
				"liens_int": [(#GET{liens_int}|json_encode)],
				"text": [(#GET{text}|json_encode)]
			},
			"classes": {
				"titre":"transcription"
			},
			"search": {
				"id": [(#GET{id_jsonl})],
				"site": [(#GET{site}|json_encode)],
				"date_aspi": [(#GET{date_aspi}|json_encode)],
				"num_page": [(#GET{num_page}|json_encode)],
				"title": [(#GET{title}|json_encode)],
				"desc": [(#GET{description}|json_encode)],
				"keywords": [(#GET{keywords}|json_encode)],
				"trackers": [(#GET{trackers}|sans_li|json_encode)],
				"rezos": [(#GET{rezos}|sans_li|json_encode)],
				"titres": [(#GET{titres}|sans_li|json_encode)],
				"liens_ext": [(#GET{liens_ext}|sans_li|json_encode)],
				"liens_int": [(#GET{liens_int}|sans_li|json_encode)],
				"text": [(#GET{text}|sans_br|json_encode)]
			}
		}
		[(#SET{id_jsonl, #GET{id_jsonl}|plus{1}})]
	</BOUCLE_2lignes>
</BOUCLE_corpus>
|
|
]
|