Wizard
Posts: 2,806
Karma: 7423683
Join Date: May 2016
Location: Ontario, Canada
Device: Kobo Mini, Aura Edition 2 v1, Clara HD
|
Quote:
Originally Posted by xtmpq
Also, is there any way to convert the v3 of dicthtml.zip to StarDict?
|
Firstly, the Kobo ones are encrypted. I will not discuss that topic. Secondly, they're not intended to be used outside Nickel.
If you did happen to want to convert an unencrypted dicthtml to another format, you can use dictutil to decompile it into a dictfile, then PyGlossary to convert that into another format. I haven't fully implemented v3 support yet, but this should work fine in general.
A bit off topic: If you are clever enough, the ones which come with Microsoft Edge (the older, non-Chromium one) are also Oxford, and they can be extracted (they aren't encrypted or protected). They are downloaded to AppData/ProgramData as an SQLite DB, which consists of an index mapping words to a chunk+shard+offset, and a table of chunks and binary data. The binary data consists of a list of compressed JSON separated by headers containing the shard index, size, and some extra junk. Here's some of the code I put together a while ago: Spoiler:
Code:
package main
import (
"bytes"
"compress/gzip"
"database/sql"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"os"
_ "github.com/mattn/go-sqlite3"
)
// queryWord looks word up in the WordLookup table and returns the values of
// its JsonShardID and JsonIndex columns.
//
// A word with no matching row is not treated as a failure: found is false and
// err is nil, so callers can tell "no such word" apart from a database error.
// Any other error from the query is returned unchanged with found set to
// false. shardID and idx are only meaningful when found is true.
func queryWord(db *sql.DB, word string) (shardID, idx int, found bool, err error) {
	row := db.QueryRow("SELECT JsonShardID, JsonIndex FROM WordLookup WHERE Name == ?", word)
	if err = row.Scan(&shardID, &idx); err != nil {
		if errors.Is(err, sql.ErrNoRows) {
			// Row.Scan reports an empty result set as ErrNoRows; that is the
			// "not found" answer, not an error.
			return 0, 0, false, nil
		}
		return 0, 0, false, err
	}
	return shardID, idx, true, nil
}
// listWords returns the Name column of every row in the WordLookup table, in
// the order the database yields them. The result is nil when the table has no
// rows. The first query, scan, or iteration error aborts the listing.
func listWords(db *sql.DB) ([]string, error) {
	result, err := db.Query("SELECT Name FROM WordLookup")
	if err != nil {
		return nil, err
	}
	defer result.Close()

	var names []string
	for result.Next() {
		var name string
		if scanErr := result.Scan(&name); scanErr != nil {
			return nil, scanErr
		}
		names = append(names, name)
	}

	// Next also returns false when iteration fails, so the loop ending does
	// not by itself mean every row was read.
	if iterErr := result.Err(); iterErr != nil {
		return nil, iterErr
	}
	return names, nil
}
// getShard returns the gzip-decompressed ShardData of the Shard row whose ID
// is shardID.
//
// shardCache is optional. When it is non-nil, a shard already present in it is
// returned without touching the database, and a freshly decompressed shard is
// stored in it before returning. The returned slice is the same one held by
// the cache, so callers must not modify it.
//
// Errors from the query (including sql.ErrNoRows for an unknown shard ID) and
// from decompression are returned unchanged.
func getShard(db *sql.DB, shardID int, shardCache map[int][]byte) ([]byte, error) {
	// Indexing a nil map is safe and simply misses, so no nil check is needed
	// for the lookup.
	if cached, ok := shardCache[shardID]; ok {
		return cached, nil
	}

	var compressed []byte
	row := db.QueryRow("SELECT ShardData FROM Shard WHERE ID == ?", shardID)
	if err := row.Scan(&compressed); err != nil {
		return nil, err
	}

	r, err := gzip.NewReader(bytes.NewReader(compressed))
	if err != nil {
		return nil, err
	}
	defer r.Close()

	data, err := ioutil.ReadAll(r)
	if err != nil {
		return nil, err
	}

	// Writing to a nil map panics, so the store does need the guard.
	if shardCache != nil {
		shardCache[shardID] = data
	}
	return data, nil
}
// splitShard splits a shard into its segments, keyed by zero-based segment
// index.
//
// The shard must start with a fixed header and end with 0x0B. Between them,
// every segment is introduced by a 0x06 byte and is expected to look like:
//
//	06 ## 00 00 00 ## ## __...
//	   |           |     ^ content (should start and end with { and })
//	   ^ index     ^ some sort of size
//
// The index byte starts at 2, so 2 is subtracted to obtain the map key. The
// content normally starts at offset 6 of the segment; if it does not look like
// a JSON object there but a '{' is found at offset 7, the content is taken
// from offset 7 instead.
//
// The layout is a guess. Any input that contradicts it yields an error rather
// than a panic. The returned slices alias shard, except for segments that had
// to be re-joined, which are copies.
func splitShard(shard []byte) (map[int][]byte, error) {
	hdr := []byte{0x00, 0x01, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x01, 0x00, 0x00, 0x00}
	if !bytes.HasPrefix(shard, hdr) {
		return nil, errors.New("unknown shard header")
	}
	if !bytes.HasSuffix(shard, []byte{0x0B}) {
		return nil, errors.New("unexpected shard end")
	}

	segments := map[int][]byte{}

	// Drop the trailing 0x0B, cut on every 0x06, and discard the first piece
	// (the shard header, which contains no 0x06).
	spl := bytes.Split(shard[:len(shard)-1], []byte{0x06})[1:]
	for i, segment := range spl {
		if len(segment) <= 6 {
			// Too short to hold the 6-byte segment header plus content, so a
			// 0x06 inside the header (index or size bytes) was mistaken for a
			// separator. Glue this piece back onto the next one; the loop
			// re-reads spl[i+1] on the following iteration.
			if i+1 >= len(spl) {
				return nil, errors.New("truncated segment at end of shard")
			}
			merged := make([]byte, 0, len(segment)+1+len(spl[i+1]))
			merged = append(merged, segment...)
			merged = append(merged, 0x06)
			merged = append(merged, spl[i+1]...)
			spl[i+1] = merged
			continue
		}

		if segment[1] != 0x00 || segment[2] != 0x00 || segment[3] != 0x00 {
			return nil, errors.New("unhandled non-zero bytes in shard header")
		}

		start := 6
		if segment[6] != '{' || segment[len(segment)-1] != '}' {
			if len(segment) > 7 && segment[7] == '{' {
				start = 7
			} else {
				// Dump the start of the offending segment to stderr to help
				// with reverse-engineering the format, then report failure.
				preview := segment
				if len(preview) > 20 {
					preview = preview[:20]
				}
				fmt.Fprintf(os.Stderr, "%#v\n", string(preview))
				fmt.Fprintf(os.Stderr, "%#v\n", segment[6])
				fmt.Fprintf(os.Stderr, "%#v\n", preview)
				return nil, errors.New("invalid json after segment header")
			}
		}

		idx := int(segment[0]) - 2
		if idx < 0 {
			return nil, errors.New("segment index byte is below 2")
		}
		if _, seen := segments[idx]; seen {
			return nil, fmt.Errorf("duplicate segment index %d", idx)
		}
		segments[idx] = segment[start:]
	}
	return segments, nil
}
// extractIdx returns the segment stored under idx in shard.
//
// The shard is split with splitShard on every call; an error from the split is
// passed through, and an idx that is not present yields "no such segment".
//
// Words worth testing with:
//   - ruffed: last word in file
//   - salinency: problematic
//   - curvet: slightly different format
//   - test
//   - example
//   - edge
func extractIdx(shard []byte, idx int) ([]byte, error) {
	byIndex, err := splitShard(shard)
	if err != nil {
		return nil, err
	}
	if data, present := byIndex[idx]; present {
		return data, nil
	}
	return nil, errors.New("no such segment")
}
// Word is the decoded form of one dictionary entry; parseWord unmarshals a
// JSON segment into it.
//
// No field carries a json tag, so encoding/json matches JSON keys against the
// Go field names. Keys in the JSON that have no field here are dropped.
type Word struct {
Name string
PhoneticName string
Pronunciation string
// PronunciationAudio presumably points at a recording of the word.
// NOTE(review): confirm what ContentURL refers to against real data.
PronunciationAudio struct {
ContentURL string
}
// MeaningGroups bundles meanings with the word forms and parts of speech
// declared alongside them.
MeaningGroups []struct {
Meanings []struct {
// First is the main meaning, next are submeanings
RichDefinitions []struct {
Fragments []struct {
Text string // Is there EVER more than one fragment!?!
}
Examples []string
Synonyms []struct {
Name string
}
}
}
// WordForms pairs a form label with the name of another word.
// NOTE(review): presumably inflections (plural, past tense, ...) — confirm.
WordForms []struct {
Form string
Word struct {
Name string
}
}
PartsOfSpeech []struct {
Name string
}
}
WordOrigin string
}
// parseWord decodes buf, the JSON content of a single segment, into a Word.
//
// On failure it returns a nil *Word together with the error from
// json.Unmarshal, so a half-filled Word is never handed to the caller.
func parseWord(buf []byte) (*Word, error) {
	var w Word
	if err := json.Unmarshal(buf, &w); err != nil {
		return nil, err
	}
	return &w, nil
}
Last edited by geek1011; 10-20-2020 at 11:33 AM.
|