git.oblomov.eu Git - ohcount/blob - test/expected_dir/optimer

   1 shell   comment #!/bin/sh
   2 shell   comment # optimer – Masserer ordlistefilene til eit kjaptsøkt format.
   3 shell   comment #
   4 shell   comment # Copyright © 2008, 2009 Karl Ove Hufthammer <karl@huftis.org>.
   5 shell   comment #
   6 shell   comment #     This file is part of Ordbanken.
   7 shell   comment #
   8 shell   comment #     Ordbanken is free software: you can redistribute it and/or modify
   9 shell   comment #     it under the terms of the GNU General Public License as published by
  10 shell   comment #     the Free Software Foundation, either version 3 of the License, or
  11 shell   comment #     (at your option) any later version.
  12 shell   comment #
  13 shell   comment #     This program is distributed in the hope that it will be useful,
  14 shell   comment #     but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 shell   comment #     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 shell   comment #     GNU General Public License for more details.
  17 shell   comment #
  18 shell   comment #     You should have received a copy of the GNU General Public License
  19 shell   comment #     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  20 shell   blank
  21 shell   comment # Utfør kommandoen på fila oppgjeven som førsteargument.
  22 shell   code    fil=$1
  23 shell   code    echo $fil
  24 shell   blank
  25 shell   comment # Forklaring på filtreringskommandoane.
  26 shell   comment #  grep: Filtrer vekk kommentarar (linjer som startar med «*»).
  27 shell   comment #  fgrep: Filtrer unormerte ord.
  28 shell   comment #  sed: Gjer om mellomrom i siste del av linja (der kodane er) til tabulatorar.
  29 shell   comment #       (Korfor den kompliserte sed-kommandoen? Fordi kodane i utgangspunktet er
  30 shell   comment #       skilde med mellomrom i staden for med tabulatorar. Dette ville ikkje vore
  31 shell   comment #       noko problem om alle oppføringar besto av eitt ord, då «column» som
  32 shell   comment #       standard handterer mellomrom og tabulatorar likt, men ordbanken har
  33 shell   comment #       oppføringar som «på kryss og tvers», og då ville alle orda få kvar si
  34 shell   comment #       kolonne (bruk «på» som oppslagsord for å sjå oppføringa).
  35 shell   comment #  sed: Fjern kodar (på forma <kode1>) som inneheld tal (interne/uforståelige kodar).
  36 shell   comment #  sed: Fjern kodar («ord» utan <>) som startar med tal (interne/uforståelige kodar).
  37 shell   comment #  sed: Fjern talkoden på starten av linja.
  38 shell   comment #  tr: Slå saman etterfølgjande tabulatorar til éin.
  39 shell   comment #  sort: Sorter fila (slik at oppslag med «look» går raskare).
  40 shell   code    grep -v '^\*' $fil \
  41 shell   code    | fgrep -v "unormert" \
  42 shell   code    | sed -r 'h;s/^([^      ]+      [^      ]+      [^      ]+      )(.*)/\2/;s/ /  /g;G;s/(.*)\n([^        ]+      [^      ]+      [^      ]+      )(.*)/\2\1/' \
  43 shell   code    | sed -r 's/<[^>]*[0-9][^>]*>+/ /g' \
  44 shell   code    | sed -r 's/    [0-9]+[^        ]*/     /g' \
  45 shell   code    | sed -r 's/^[0-9]+\s+//' \
  46 shell   code    | tr -s '\t' \
  47 shell   code    | LC_ALL=C sort > "${fil%.txt}.dat"