-- /usr/share/ada/adainclude/texttools/strings.adb

------------------------------------------------------------------------------
-- STRINGS (package body)                                                   --
--                                                                          --
-- Part of TextTools                                                        --
-- Designed and Programmed by Ken O. Burtch                                 --
--                                                                          --
------------------------------------------------------------------------------
--                                                                          --
--                 Copyright (C) 1999-2007 Ken O. Burtch                    --
--                                                                          --
-- This is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
-- sion.  This is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
-- for  more details.  You should have  received  a copy of the GNU General --
-- Public License  distributed with this;  see file COPYING.  If not, write --
-- to  the Free Software Foundation,  59 Temple Place - Suite 330,  Boston, --
-- MA 02111-1307, USA.                                                      --
--                                                                          --
-- As a special exception,  if other files  instantiate  generics from this --
-- unit, or you link  this unit with other files  to produce an executable, --
-- this  unit  does not  by itself cause  the resulting  executable  to  be --
-- covered  by the  GNU  General  Public  License.  This exception does not --
-- however invalidate  any other reasons why  the executable file  might be --
-- covered by the  GNU Public License.                                      --
--                                                                          --
-- This is maintained at http://www.pegasoft.ca/tt.html                     --
--                                                                          --
------------------------------------------------------------------------------

with Ada.Strings.Fixed;
with Ada.Strings.Maps.Constants;

package body strings is

-- Table of character pairs used by basic_pack and unpack.  Every two
-- adjacent characters dips(j..j+1) form one pair (pairs overlap); a pair
-- found at position j is stored in packed text as character'val( j + 127 ).
-- Changing this string changes the meaning of previously packed data.
dips : constant string := "upanlyscolableutalisifensusteasauayeeieoeseyiaotoouuichetirontrshaithoaghurngeregundewhbackamedorvarine a d f o n r s  e_r_s_e.";

-- Character mapping selected by a "case insensitive" flag: True picks the
-- upper-case mapping, False picks the identity mapping (characters are
-- compared unchanged).
Case_Mappings : constant array (Boolean) of Ada.Strings.Maps.Character_Mapping
  := (True => Ada.Strings.Maps.Constants.Upper_Case_Map,
      False => Ada.Strings.Maps.Identity);

procedure FixSpacing( s : in out unbounded_string ) is
-- Strip blanks from both ends of s, then squeeze every interior run of
-- blanks down to a single blank.
   cursor : Integer := 1;
begin
   Trim (s, Side => Ada.Strings.Both);
   while cursor < Length (s) loop
      if Slice (s, cursor, cursor + 1) = "  " then
         -- drop one blank and look at the same position again
         Delete (s, cursor, cursor);
      else
         cursor := cursor + 1;
      end if;
   end loop;
end FixSpacing;

function PhoneticsOf( s : string ) return String is
-- reduce string to ENGLISH phonetics
-- equivalences from Talking Tools pg.12 (and from guessing)
--
-- The input is scanned left to right, letters being folded to lower case.
-- Each letter (or recognized letter pair) appends one upper-case code
-- letter to the result; digits are copied through unchanged; any other
-- character produces no output but permits the next code to repeat the
-- previous one.  The accumulated codes are returned as a fixed string.
  pos  : natural := S'First;                  -- position in s
  ppos : natural := 1;                  -- position in PhoneticString
  PhoneticString : Unbounded_String := Null_Unbounded_String; -- the resulting phonetics
  ch : character;                       -- current character in s
  AllowDuplicate : boolean := false;    -- TRUE to keep the next code even if
                                        -- it repeats the previous one

  function NextChar return character is
    -- get the next character (if none, return a space)
    -- upper-case letters A..Z are returned in lower case
    ch : character;
  begin
    if Pos < S'Last then
       Ch := S (pos+1);
       if ch >= 'A' and ch <= 'Z' then
          Ch := character'val( character'pos(ch) + 32 );
       end if;
       return ch;
    else
       return ' ';
    end if;
  end NextChar;

  procedure Add( c : character ) is
  -- add a phoneme to the Phonetic String, discarding adjacent duplicates
  -- unless AllowDuplicate is set (the flag is cleared once it is used).
  -- Some very similar sounds are grouped together (th & d)
  begin
     if ppos = 1 or AllowDuplicate then
	Append( PhoneticString, c );
	ppos := ppos + 1;
	AllowDuplicate := false;
     else
	if Element( PhoneticString, ppos-1 ) /= c then
	   Append( PhoneticString, c );
	   ppos := ppos + 1;
	end if;
     end if;
  end Add;

  procedure SkipChar is
  -- macro to advance to next position in s
  begin
    pos := pos + 1;
  end SkipChar;
  pragma Inline( SkipChar );

begin
   while Pos <= S'Last loop
      ch := S (Pos);
      -- fold upper-case ASCII letters to lower case
      if ch >= 'A' and ch <= 'Z' then
         ch := character'val( character'pos(ch) + 32 );
      end if;
      -- Branches that consume a two-letter pair call SkipChar once; the
      -- unconditional increment at the bottom of the loop consumes the other.
      case ch is
      when 'a' =>
        case NextChar is
        when 'a'|'e'|'i'|'y' =>                        -- aa, ae, ai, ay
           Add( 'A' );
           SkipChar;
        when 'r' =>                                    -- ar
           Add( 'R' );
           SkipChar;
        when 'u' =>                                    -- au
           Add( 'U' );
           SkipChar;
        when others =>
           Add( 'A' );                                 -- a
        end case;
      when 'b' =>                                      -- b
        Add( 'B' );
      when 'd' =>                                      -- d
        Add( 'D' );
      when 't' =>
        if NextChar = 'h' then                         -- th (H)
           Add( 'H' );
           SkipChar;
        else
           Add( 'D' );                                 -- t (=d)
        end if;
      when 'p' =>
        if NextChar = 'h' then                         -- ph (F)
           Add( 'F' );
           SkipChar;
        else
           Add( 'P' );                                 -- p
        end if;
      when 'c' =>                                      -- c*
         if NextChar = 'h' then                        -- ch (Y)
            Add( 'Y' );
            SkipChar;
         else
            Add( 'C' );
         end if;
      when 'e' =>
        case NextChar is
        when 'a' => Add( 'E' ); SkipChar;              -- ea
        when 'i' => Add( 'I' ); SkipChar;              -- ei
        when 'e' => Add( 'E' ); SkipChar;              -- ee
        when 'r' => Add( 'R' ); SkipChar;              -- er
        when 'u' => Add( 'U' ); SkipChar;              -- eu
        when 'y' => Add( 'A' ); SkipChar;              -- ey
        -- NOTE(review): the separator after a silent e is skipped as well,
        -- so it never reaches the "others" branch that sets AllowDuplicate.
        -- A trailing e at the end of s also lands here (NextChar = ' ').
        when ' '|'?'|'''|':'|';'|'.'|',' => SkipChar; -- e (silent)
        when others =>                                 -- e
             Add( 'E' );
        end case;
      when 'f' =>                                      -- f
        Add( 'F' );
      when 'g' =>                                      -- gh
        if NextChar = 'h' then                         -- gh adds nothing
           SkipChar;
        else
           Add( 'G' );                                 -- g*
        end if;
      when 'h' =>                                      -- h
        null;                                          -- a lone h adds nothing
      when 'i' =>                                      -- i
        if NextChar = 'e' then                         -- ie
           Add( 'E' );
           SkipChar;
        elsif NextChar = 'r' then                      -- ir
           Add( 'R' );
           SkipChar;
        elsif NextChar = 'o' then                      -- ion
           pos := pos + 1;  -- step onto the o so NextChar looks at the 3rd letter
           if NextChar = 'n' then
              Add( 'U' );
              Add( 'N' );
              SkipChar;
           else
              pos := pos - 1; -- treat normally
              Add( 'I' );
           end if;
        else
           Add( 'I' );
        end if;
      when 'j' =>                                      -- j
        Add( 'J' );
      when 'k'|'q' =>                                  -- k
        Add('K');
        -- NOTE(review): this test is shared by k and q, so "ku" is also
        -- coded as KW -- confirm that this is intended.
        if NextChar = 'u' then                         -- qu (KW)
           Add( 'W' );
           SkipChar;
        end if;
      when 'l'|'r' =>                                  -- l, r
        Add( 'R' );
      when 'm' =>                                      -- m
        Add( 'N' );                                    -- m is coded like n
      when 'n' =>
        if NextChar = 'g' then
           SkipChar;                                   -- ng (=n)
        end if;
        Add( 'N' );                                    -- n
      when 'o' =>
        case NextChar is
        when 'a' =>                                    -- oa
             Add( 'O' );
             SkipChar;
        when 'o' =>                                    -- oo
             Add( 'U' );
             SkipChar;
        when 'r' =>                                    -- or
             Add( 'R' );
             SkipChar;
        when 'u' =>                                    -- ou
             Add( 'U' );
             SkipChar;
        when others =>                                 -- o
             Add( 'O' );
        end case;
      when 's' =>                                      -- sh (H)
        if NextChar = 'h' then
           Add( 'H' );
           SkipChar;
        else
           Add( 'S' );                                 -- s
        end if;
      when 'u' =>
        if NextChar = 'y' then                         -- uy
           Add( 'I' );
           SkipChar;
        elsif NextChar = 'r' then                      -- ur
           Add( 'R' );
           SkipChar;
        else
           Add ( 'U' );                                -- u
        end if;
      when 'v' =>                                      -- v
        Add( 'V' );
      when 'w' =>                                      -- w
        Add( 'W' );
      when 'x'|'z' =>                                  -- x, z
        Add( 'Z' );
      when 'y' =>                                      -- y
        Add( 'I' );
      when others =>
        AllowDuplicate := true;  -- allow two together if sep by sp, ', etc
        if ch >= '0' and ch <= '9' then                -- 0...9
           Add( ch );
           AllowDuplicate := true;  -- Add cleared the flag; set it again
        end if;
      end case;
      pos := pos + 1;
  end loop;
  return To_String (PhoneticString);
end PhoneticsOf;

function TypoOf( BadString, GoodString : String) return boolean is
   -- 80% of all typos are single insertions, deletions, exchanges, or subs.
   --
   -- Return True when BadString differs from GoodString by exactly one of:
   --   * one extra character in BadString (insertion),
   --   * one missing character in BadString (deletion),
   --   * two adjacent characters swapped (exchange),
   --   * one character replaced by another (substitution, at ANY position
   --     including the last one).
   -- Return False when the strings are identical, when either one is
   -- shorter than 4 characters (too short to test reliably), or when none
   -- of the patterns above applies.

   function Is_Single_Removal (Longer, Shorter : String) return Boolean is
      -- True when deleting exactly one character of Longer yields Shorter.
   begin
      for I in Longer'Range loop
         if Longer (Longer'First .. I - 1)
           & Longer (I + 1 .. Longer'Last) = Shorter then
            return True;
         end if;
      end loop;
      return False;
   end Is_Single_Removal;

begin
   if BadString = GoodString
     or BadString'Length < 4
     or GoodString'Length < 4 then
      -- identical or too short to test reliably?
      return false;
   end if;

   if BadString'Length = GoodString'Length + 1 then
      -- Single Insertion
      return Is_Single_Removal (BadString, GoodString);
   elsif BadString'Length + 1 = GoodString'Length then
      -- Single Deletion
      return Is_Single_Removal (GoodString, BadString);
   elsif BadString'Length /= GoodString'Length then
      return False;
   end if;

   -- Single Exchange or Substitution.  The strings have equal length and
   -- are known to differ, so a first and a last mismatch always exist and
   -- both scans below terminate inside the string.
   declare
      -- index translation: the two strings need not share bounds
      Offset     : constant Integer := GoodString'First - BadString'First;
      First_Diff : Positive := BadString'First;
      Last_Diff  : Positive := BadString'Last;
   begin
      while BadString (First_Diff) = GoodString (First_Diff + Offset) loop
         First_Diff := First_Diff + 1;
      end loop;
      while BadString (Last_Diff) = GoodString (Last_Diff + Offset) loop
         Last_Diff := Last_Diff - 1;
      end loop;

      if First_Diff = Last_Diff then
         -- exactly one position differs: substitution
         return True;
      end if;

      -- exactly two adjacent positions differ and they are swapped: exchange
      return Last_Diff = First_Diff + 1
        and then BadString (First_Diff) = GoodString (Last_Diff + Offset)
        and then BadString (Last_Diff) = GoodString (First_Diff + Offset);
   end;
end TypoOf;

procedure Tokenize (S     : in string;
                    Words : in out strlist.Vector;
                    ch    : in out character ) is
   -- encode a word as a character > 127
   --
   -- S is looked up in Words.  When it is found at position 1 .. 128, ch
   -- is set to character'val( position + 127 ), i.e. a code 128 .. 255.
   -- When S is not in the list, or its position is too large to fit in a
   -- single character, ch is set to ASCII.NUL meaning "no token".
   -- (Previously a position above 128 was returned as character'val of
   -- the position itself: that raised Constraint_Error above 255 and, for
   -- 129 .. 255, produced a code that Untokenize maps to a different word.)
   Max_Tokens : constant := 128;   -- number of codes available (128 .. 255)
   Index      : constant Natural := Words.Find_Index (S);
begin
   if Index in 1 .. Max_Tokens then
      ch := character'val( Index + 127 );
   else
      ch := ASCII.NUL;
   end if;
end Tokenize;

procedure Untokenize (Ch    : in     Character;
                      Words : in out Strlist.Vector;
                      S     : in out unbounded_string) is
   -- Turn a token character back into its word.  Codes above 127 select
   -- element (code - 127) of Words; any other character gives an empty S.
   Code : constant Natural := Character'Pos (Ch);
begin
   S := Null_Unbounded_String;
   if Code >= 128 then
      S := To_Unbounded_String (Words.Element (Code - 127));
   end if;
end Untokenize;

function FGREP (s : string;
                text : string;
                filter_out : boolean := false;
                case_insensitive : boolean := false )
               return Boolean
is
   -- Fixed-string grep on a single line.
   --   s                : the substring to look for
   --   text             : the line to search
   --   filter_out       : when True the sense of the result is inverted
   --   case_insensitive : when True letter case is ignored
   -- Returns True when text contains s (or, with filter_out, when it does
   -- not).  An empty s is treated as matching every line.
   Fold  : constant Ada.Strings.Maps.Character_Mapping
     := Case_Mappings (case_insensitive);
   Found : Boolean;
begin
   if s'Length = 0 then
      -- Ada.Strings.Fixed.Index raises Pattern_Error for an empty
      -- pattern; an empty search string matches everything.
      Found := True;
   else
      -- Index applies the mapping to the searched text only, so the
      -- pattern must be folded with the same mapping; otherwise a
      -- lower-case pattern can never match the upper-cased text.
      Found := Ada.Strings.Fixed.Index
        (text,
         Ada.Strings.Fixed.Translate (s, Fold),
         Mapping => Fold) > 0;
   end if;
   return Found xor filter_out;
end FGREP;

function FGREP (s : string;
                text : string;
                filter_out : boolean := false;
                case_insensitive : boolean := false )
               return String
is
   -- Line-returning form of FGREP: gives back text itself when the
   -- Boolean FGREP accepts the line, and an empty string when it does not.
begin
   if not FGREP (s, text, filter_out, case_insensitive) then
      return "";
   end if;
   return text;
end FGREP;

procedure FGREP (s : in String;
                 text : in Strlist.Vector;
                 result : out boolean;
                 filter_out : boolean := false;
                 case_insensitive : boolean := false )
is
   -- Test a list of lines: result becomes True as soon as one line is
   -- accepted by the single-line FGREP, and stays False when none is
   -- (including when the list is empty).
   Total : constant Integer := Integer (text.Length);
   Line  : Integer := 0;
begin
   result := False;
   while not result and then Line < Total loop
      Line := Line + 1;
      result := FGREP (s, text.Element (Line), filter_out, case_insensitive);
   end loop;
end FGREP;

procedure FGREP (s : string;
                 text : in out Strlist.Vector;
                 filter_out : boolean := false;
                 case_insensitive : boolean := false )
is
   -- Filter a list of lines in place: every line rejected by the
   -- single-line FGREP is removed; the remaining lines keep their order.
begin
   -- Walk from the end so that deleting a line never shifts the
   -- positions that are still to be visited.
   for Line in reverse 1 .. Integer (text.Length) loop
      if not FGREP (s, text.Element (Line), filter_out, case_insensitive) then
         text.Delete (Line);
      end if;
   end loop;
end FGREP;

---> ASCII Encode/Decode

-- Field terminator written after each encoded integer, long_integer and
-- string value, and searched for when those values are decoded.
separator : constant character := character'val(1);

procedure Encode( estr : in out EncodedString; i : integer ) is
   -- Append the decimal image of i, followed by the field separator.
begin
   Append( estr, integer'image( i ) & separator );
end Encode;

procedure Encode( estr : in out EncodedString; r : ARect ) is
   -- Append the four edges of r as separate fields, in the order
   -- left, top, right, bottom.
begin
   Encode( estr, r.left );
   Encode( estr, r.top );
   Encode( estr, r.right );
   Encode( estr, r.bottom );
end Encode;

procedure Encode( estr : in out EncodedString; l : long_integer ) is
   -- Append the decimal image of l, followed by the field separator.
begin
   Append( estr, long_integer'image( l ) & separator );
end Encode;

procedure Encode( estr : in out EncodedString; s : string) is
   -- Append the text of s, followed by the field separator.
   -- NOTE(review): s is written as-is; a separator character inside s
   -- would end the field early when decoded.
begin
   Append( estr, s & separator );
end Encode;

procedure Encode( estr : in out EncodedString; c : character ) is
   -- Append the single character c; no separator follows it.
begin
   estr := estr & c;
end Encode;

procedure Encode( estr : in out EncodedString; b : boolean ) is
   -- Append one character for b: 'T' for true, 'F' for false.
   Flag : constant array (boolean) of character := (false => 'F', true => 'T');
begin
   Append( estr, Flag (b) );
end Encode;

procedure Decode( estr : in out EncodedString; i : out integer ) is
   -- Read the leading integer field of estr into i and remove that field,
   -- together with its separator, from the front of estr.
   stop : integer := 1;   -- becomes the position of the field's separator
begin
   loop
      exit when Element( estr, stop ) = separator;
      stop := stop + 1;
   end loop;
   i := integer'Value (Slice (estr, 1, stop - 1));
   -- removing the first stop characters keeps the same remainder as
   -- taking the tail of length Length (estr) - stop
   Delete (estr, 1, stop);
end Decode;

procedure Decode( estr : in out EncodedString; r : out ARect ) is
   -- Read four consecutive fields from the front of estr into the edges
   -- of r, in the order left, top, right, bottom.
begin
   Decode( estr, r.left );
   Decode( estr, r.top );
   Decode( estr, r.right );
   Decode( estr, r.bottom );
end Decode;

procedure Decode( estr : in out EncodedString; l : out long_integer ) is
   -- Read the leading long_integer field of estr into l and remove that
   -- field, together with its separator, from the front of estr.
   stop : integer := 2;   -- the scan for the separator starts at position 2
begin
   loop
      exit when Element( estr, stop ) = separator;
      stop := stop + 1;
   end loop;
   l := long_integer'Value (Slice (estr, 1, stop - 1));
   -- removing the first stop characters keeps the same remainder as
   -- taking the tail of length Length (estr) - stop
   Delete (estr, 1, stop);
end Decode;

procedure Decode( estr : in out EncodedString; s : out Unbounded_String) is
   -- Read the leading text field of estr (everything before the first
   -- separator) into s and remove the field and its separator from estr.
   cut : constant Natural := Index (estr, String'(1 => separator));
begin
   s := Head (estr, cut - 1 );
   Delete (estr, 1, cut);
end Decode;

procedure Decode( estr : in out EncodedString; c : out character ) is
   -- Read the first character of estr into c and remove it from estr.
begin
   c := Element( estr, 1 );
   Delete (estr, 1, 1);
end Decode;

procedure Decode( estr : in out EncodedString; b : out boolean ) is
   -- Read a one-character boolean from the front of estr: 'T' gives true,
   -- anything else gives false (an assertion flags values other than
   -- 'T' and 'F' when assertions are enabled).
   flag : character := ASCII.NUL;
begin
   Decode( estr, flag );
   pragma Assert (flag = 'T' or flag = 'F');
   if flag = 'T' then
      b := true;
   else
      b := false;
   end if;
end Decode;

--  BASIC PACK
--
-- Compress string s using diphthong (two-character pair) compression,
-- resulting in a new string of 50% to 100% the size of the original.  s must
-- contain only lower ASCII characters (codes 0..127) since the upper ASCII
-- characters (codes 128..255) are used for the compression.
------------------------------------------------------------------------------

function basic_pack( s : string ) return packed_string is
   -- Scan s left to right.  Whenever the next two characters appear as an
   -- adjacent pair in dips, emit one character whose code is the pair's
   -- position in dips plus 127 and step over both; otherwise copy one
   -- character through unchanged.
   cursor : positive;
   code   : natural;              -- position of the matched pair, 0 if none
   packed : unbounded_string;
begin
   cursor := s'first;
   packed := null_unbounded_string;
   while cursor <= s'last loop
      code := 0;
      if cursor < s'last then     -- a pair needs two characters
         for j in dips'first..dips'last-1 loop
            if dips(j) = s(cursor) and then dips(j+1) = s(cursor+1) then
               code := j;
               exit;
            end if;
         end loop;
      end if;
      if code = 0 then
         Append( packed, s(cursor) );
         cursor := cursor + 1;
      else
         Append( packed, character'val( code + 127 ) );
         cursor := cursor + 2;
      end if;
   end loop;
   return packed_string( to_string( packed ) );
end basic_pack;

--  UNPACK
--
-- Decompress string s that was compressed using basic_pack.
------------------------------------------------------------------------------

function unpack( s : packed_string ) return string is
   -- Expand a packed string: characters with codes below 128 are copied
   -- through, every other character is replaced by the two-character pair
   -- of dips that starts at position (code - 127).
   expanded : unbounded_string;
   code     : natural;
begin
   for k in s'range loop
      code := character'pos( s(k) );
      if code < 128 then
         Append( expanded, s(k) );
      else
         Append( expanded, dips( code - 127 .. code - 126 ) );
      end if;
   end loop;
   return to_string( expanded );
end unpack;

end strings;
-- libtexttools4-dev 2.1.0-6build2 / usr / share / ada / adainclude / texttools / strings.adb