Last active
November 19, 2017 06:20
-
-
Save ZzZombo/e78cad56722eda7f9fe8f40b33ad6bb5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use NativeCall; | |
class Encoding::UCS2::Encoder does Encoding::Encoder | |
{ | |
my Str:D $endianness=nativecast(CArray[uint8],Buf[uint16].new(0x1234))[0].base(16)==12 ?? 'BE' !! 'LE'; | |
my Junction:D $valid-range=(0x0000..0xD7FF)|(0xE000..0xFFFF); | |
has Bool:D $!write-bom=Bool::True; | |
has Bool:D $!translate-nl=Bool::True; | |
has Str:D $!target-endianness='BE'; | |
has Str:D $!replacement='�'; | |
submethod TWEAK(:$!target-endianness where 'BE'|'LE'='BE',:$!write-bom=Bool::True,:$!replacement where {.ord ~~ $valid-range}='�', | |
:$!translate-nl=Bool::True) | |
{ | |
} | |
method encode-chars(Str:D $input is copy --> Blob:D) | |
{ | |
my $encoded=Buf[uint8].new; | |
$input.subst-mutate(/\r?\n/,"\r\n") if $!translate-nl && $*VM.osname ~~ /win/; | |
my @cps=$input.ords; | |
@cps.unshift(0xFEFF) if $!write-bom; | |
$encoded.append(@cps.map( | |
{ | |
my $cp=$_ ~~ $valid-range ?? $_ !! $!replacement.ord; | |
my uint8 @cus=$cp % 256,$cp +> 8 % 256; #code units. | |
|($!target-endianness ne $endianness ?? @cus.reverse !! @cus); | |
})); | |
$encoded; | |
} | |
} | |
class Encoding::UCS2::Decoder does Encoding::Decoder | |
{ | |
has Buf:D $!bytes=Buf[uint8].new; | |
has UInt:D $!position=0; | |
has Str:D @!seps; | |
method !decode(UInt:D $amount=$!bytes.elems --> Str:D) | |
{ | |
say "DEC $!position,$amount -> {$!position+$amount}/$!bytes.elems()"; | |
my $result=$!bytes.subbuf($!position,$amount).decode: 'utf-16'; | |
#"<$result>".say; | |
$!position += $amount; | |
$!position min= $!bytes.elems; | |
$result; | |
} | |
method add-bytes(Blob:D $bytes --> Nil) | |
{ | |
say "ADD $!position+$bytes.elems() ($bytes.gist()) -> {$!position+$bytes.elems}/$!bytes.elems()"; | |
$!bytes.append: $bytes; | |
} | |
method consume-available-chars(--> Str:D) | |
{ | |
"CAVC".say; | |
self!decode: $_ %% 2 ?? $_ !! $_ - 1 given $.bytes-available; | |
} | |
method consume-all-chars(--> Str:D) | |
{ | |
"CALC".say; | |
$.consume-available-chars; | |
} | |
method consume-exactly-chars(int $chars, Bool:D :$eof = False --> Str) | |
{ | |
"CEC $chars, $eof".say; | |
return Str if $.bytes-available div 2 < $chars; | |
self!decode: $chars*2; | |
} | |
method set-line-separators(@seps --> Nil) | |
{ | |
say "SEPS: ",@seps.map(*.NFC);@!seps=@seps; | |
} | |
method consume-line-chars(Bool:D :$chomp=False,Bool:D :$eof=False --> Str) | |
{ | |
"CLC $chomp, $eof".say; | |
my ($s,$sep); | |
while !$sep && !$.is-empty | |
{ | |
$s ~= $.consume-exactly-chars(1); | |
for @!seps | |
{ | |
if $s.ends-with: $_ | |
{ | |
$sep=$_; | |
last; | |
} | |
} | |
} | |
if $chomp | |
{ | |
$s.subst-mutate($sep,''); | |
} | |
$s; | |
} | |
method is-empty(--> Bool:D) | |
{ | |
$!bytes.elems < $!position; | |
} | |
method bytes-available(--> Int:D) | |
{ | |
my $r=$!bytes.elems-$!position max 0; | |
"BYTES $!position/$!bytes.elems() -> $r".say;$r; | |
} | |
method consume-exactly-bytes(int $bytes --> Blob) | |
{ | |
"CEB $bytes".say; | |
return Blob if $.bytes-available < $bytes; | |
my $result=$!bytes.subbuf: $!position,$bytes; | |
$!position += $bytes; | |
} | |
} | |
class Encoding::UCS2 does Encoding | |
{ | |
method name(::?CLASS: --> Str:D) | |
{ | |
'iso-10646-ucs-2'; | |
} | |
method alternative-names(::?CLASS: --> List:D) | |
{ | |
'ucs2','ucs-2'; | |
} | |
method encoder(::?CLASS: |args --> Encoding::Encoder:D) | |
{ | |
Encoding::UCS2::Encoder.new(|args); | |
} | |
method decoder(::?CLASS: |args --> Encoding::Decoder:D) | |
{ | |
#Encoding::Registry.find('utf16').decoder(|args); | |
Encoding::UCS2::Decoder.new(|args); | |
} | |
} | |
class Encoding::UCS2BE is Encoding::UCS2 | |
{ | |
method name(::?CLASS: --> Str:D) | |
{ | |
callsame~'-be'; | |
} | |
method alternative-names(::?CLASS: --> List:D) | |
{ | |
callsame.map({|($_~"-be",$_~'be')}).List; | |
} | |
method encoder(::?CLASS: |args --> Encoding::Encoder:D) | |
{ | |
nextwith(|args,:target-endianness<BE>); | |
} | |
} | |
class Encoding::UCS2LE is Encoding::UCS2 | |
{ | |
method name(::?CLASS: --> Str:D) | |
{ | |
callsame~'-le'; | |
} | |
method alternative-names(::?CLASS: --> List:D) | |
{ | |
my @a=callsame.map({|($_~"-le",$_~'le')}); | |
} | |
method encoder(::?CLASS: |args --> Encoding::Encoder:D) | |
{ | |
nextwith(|args,:target-endianness<LE>); | |
} | |
} | |
Encoding::Registry.register(Encoding::UCS2); | |
Encoding::Registry.register(Encoding::UCS2LE); | |
Encoding::Registry.register(Encoding::UCS2BE); | |
Encoding::Registry.find('ucs-2le').say; | |
multi sub MAIN(Str:D :$dir='.') | |
{ | |
my $filepath=$dir.IO.add('out.txt'); | |
say "\t$filepath.absolute()..."; | |
my $h=$filepath.open(:mode<rw>,:enc<ucs-2le>); | |
my $text=$h.slurp;say +$text.comb; | |
$text.say; | |
exit 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment