182 lines
4.0 KiB
Go
182 lines
4.0 KiB
Go
|
package chinese
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"encoding/json"
|
||
|
"fmt"
|
||
|
"strings"
|
||
|
|
||
|
"go-common/library/log"
|
||
|
)
|
||
|
|
||
|
// defaultConversion is the key into conversions that Convert and Converts use.
var defaultConversion = "s2twp"
|
||
|
|
||
|
// Group holds a sequence of dicts
|
||
|
type Group struct {
|
||
|
Files []string
|
||
|
Dicts []*dict
|
||
|
}
|
||
|
|
||
|
func (g *Group) String() string {
|
||
|
return fmt.Sprintf("%+v", g.Files)
|
||
|
}
|
||
|
|
||
|
// OpenCC contains the converter
|
||
|
type openCC struct {
|
||
|
Conversion string
|
||
|
Description string
|
||
|
DictGroup []*Group
|
||
|
}
|
||
|
|
||
|
var conversions = map[string]*openCC{
|
||
|
"s2twp": {Conversion: s2twp},
|
||
|
// "hk2s": {Conversion: hk2s}, "s2hk": {Conversion: s2hk}, "s2t": {Conversion: s2t},
|
||
|
// "s2tw": {Conversion: s2tw}, "t2hk": {Conversion: t2hk},
|
||
|
// "t2s": {Conversion: t2s}, "t2tw": {Conversion: t2tw},
|
||
|
// "tw2s": {Conversion: tw2s}, "tw2sp": {Conversion: tw2sp},
|
||
|
}
|
||
|
|
||
|
// Init construct an instance of OpenCC.
|
||
|
func Init() {
|
||
|
for k, v := range conversions {
|
||
|
if err := v.dict(k); err != nil {
|
||
|
panic(err)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Converts .
|
||
|
func Converts(ctx context.Context, in ...string) (out map[string]string) {
|
||
|
var err error
|
||
|
out = make(map[string]string, len(in))
|
||
|
for _, v := range in {
|
||
|
if out[v], err = convert(v, defaultConversion); err != nil {
|
||
|
log.Error("convert(%s),err:%+v", in, err)
|
||
|
out[v] = v
|
||
|
}
|
||
|
}
|
||
|
return
|
||
|
}
|
||
|
|
||
|
// Convert string from Simplified Chinese to Traditional Chinese .
|
||
|
func Convert(ctx context.Context, in string) (out string) {
|
||
|
var err error
|
||
|
if out, err = convert(in, defaultConversion); err != nil {
|
||
|
log.Error("convert(%s),err:%+v", in, err)
|
||
|
}
|
||
|
return
|
||
|
}
|
||
|
|
||
|
func (cc *openCC) dict(conversion string) error {
|
||
|
var m interface{}
|
||
|
json.Unmarshal([]byte(cc.Conversion), &m)
|
||
|
config := m.(map[string]interface{})
|
||
|
cc.Description = config["name"].(string)
|
||
|
dictChain, ok := config["conversion_chain"].([]interface{})
|
||
|
if !ok {
|
||
|
return fmt.Errorf("format %+v not correct", config)
|
||
|
}
|
||
|
for _, v := range dictChain {
|
||
|
d, ok := v.(map[string]interface{})
|
||
|
if !ok {
|
||
|
return fmt.Errorf("should be map inside conversion_chain")
|
||
|
}
|
||
|
dictMap, ok := d["dict"]
|
||
|
if !ok {
|
||
|
return fmt.Errorf("should have dict inside conversion_chain")
|
||
|
}
|
||
|
if dict, ok := dictMap.(map[string]interface{}); ok {
|
||
|
group, err := cc.group(dict)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
cc.DictGroup = append(cc.DictGroup, group)
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (cc *openCC) group(d map[string]interface{}) (*Group, error) {
|
||
|
typ, ok := d["type"].(string)
|
||
|
if !ok {
|
||
|
return nil, fmt.Errorf("type should be string")
|
||
|
}
|
||
|
res := &Group{}
|
||
|
switch typ {
|
||
|
case "group":
|
||
|
dicts, ok := d["dicts"].([]interface{})
|
||
|
if !ok {
|
||
|
return nil, fmt.Errorf("dicts field invalid")
|
||
|
}
|
||
|
for _, dict := range dicts {
|
||
|
d, ok := dict.(map[string]interface{})
|
||
|
if !ok {
|
||
|
return nil, fmt.Errorf("dicts items invalid")
|
||
|
}
|
||
|
group, err := cc.group(d)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
res.Files = append(res.Files, group.Files...)
|
||
|
res.Dicts = append(res.Dicts, group.Dicts...)
|
||
|
}
|
||
|
case "txt":
|
||
|
file, ok := d["file"]
|
||
|
if !ok {
|
||
|
return nil, fmt.Errorf("no file field found")
|
||
|
}
|
||
|
daDict, err := buildFromFile(file.(string))
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
res.Files = append(res.Files, file.(string))
|
||
|
res.Dicts = append(res.Dicts, daDict)
|
||
|
default:
|
||
|
return nil, fmt.Errorf("type should be txt or group")
|
||
|
}
|
||
|
return res, nil
|
||
|
}
|
||
|
|
||
|
// convert string from Simplified Chinese to Traditional Chinese or vice versa
|
||
|
func convert(in, conversion string) (string, error) {
|
||
|
if conversion == "" {
|
||
|
conversion = defaultConversion
|
||
|
}
|
||
|
for _, group := range conversions[conversion].DictGroup {
|
||
|
r := []rune(in)
|
||
|
var tokens []string
|
||
|
for i := 0; i < len(r); {
|
||
|
s := r[i:]
|
||
|
var token string
|
||
|
max := 0
|
||
|
for _, dict := range group.Dicts {
|
||
|
ret, err := dict.prefixMatch(string(s))
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
if len(ret) > 0 {
|
||
|
o := ""
|
||
|
for k, v := range ret {
|
||
|
if len(k) > max {
|
||
|
max = len(k)
|
||
|
token = v[0]
|
||
|
o = k
|
||
|
}
|
||
|
}
|
||
|
i += len([]rune(o))
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
if max == 0 { //no match
|
||
|
token = string(r[i])
|
||
|
i++
|
||
|
}
|
||
|
tokens = append(tokens, token)
|
||
|
}
|
||
|
in = strings.Join(tokens, "")
|
||
|
}
|
||
|
return in, nil
|
||
|
}
|