MobileRead Forums - View Single Post

geek1011 · 10-20-2020, 11:21 AM

Quote:

Originally Posted by xtmpq

Also, is there any way to convert the v3 of dicthtml.zip to stardict?

Firstly, the Kobo ones are encrypted. I will not discuss that topic. Secondly, they're not intended to be used outside Nickel.

If you did happen to want to convert an unencrypted dicthtml to another format, you can use dictutil to decompile it into a dictfile, then PyGlossary to convert that into another format. I haven't fully implemented v3 support yet, but this should work fine in general.

A bit off topic: If you are clever enough, the ones which come with Microsoft Edge (the older, non-chromium, one) are also Oxford, and they can be extracted (they aren't encrypted or protected). They are downloaded to AppData/ProgramData as a sqlite DB, which consists of an index of words to a chunk+shard+offset and a table of chunks and binary data. The binary data consists of a list of compressed JSON separated by headers containing the shard index, size, and some extra junk. Here's some of the code I put together a while ago:

Spoiler:

Code:

package main 

import (
	"bytes"
	"compress/gzip"
	"database/sql"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	
	_ "github.com/mattn/go-sqlite3"
) 

// queryWord looks word up in the WordLookup table and reports the ID of the
// shard holding its entry together with the entry's index inside that shard.
//
// A word that is not in the table is not an error: found is false, err is nil
// and both numbers are zero. err is non-nil only when the query or the scan
// itself fails.
func queryWord(db *sql.DB, word string) (shardID, idx int, found bool, err error) {
	row := db.QueryRow("SELECT JsonShardID, JsonIndex FROM WordLookup WHERE Name == ?", word)
	if err = row.Scan(&shardID, &idx); err != nil {
		// Row.Scan reports "no matching row" as sql.ErrNoRows; that case is
		// what the found result exists for, so do not surface it as an error.
		if errors.Is(err, sql.ErrNoRows) {
			return 0, 0, false, nil
		}
		return 0, 0, false, err
	}
	return shardID, idx, true, nil
}

// listWords returns every Name stored in the WordLookup table, in the order
// the database yields the rows. The result is nil when the table has no rows
// or when any step of the query fails.
func listWords(db *sql.DB) ([]string, error) {
	cursor, queryErr := db.Query("SELECT Name FROM WordLookup")
	if queryErr != nil {
		return nil, queryErr
	}
	defer cursor.Close()

	var names []string
	for cursor.Next() {
		// Grow first, then scan straight into the freshly added slot.
		names = append(names, "")
		if scanErr := cursor.Scan(&names[len(names)-1]); scanErr != nil {
			return nil, scanErr
		}
	}

	// Next returning false can mean "done" or "failed"; Err tells them apart.
	if iterErr := cursor.Err(); iterErr != nil {
		return nil, iterErr
	}
	return names, nil
}

// getShard returns the decompressed contents of the shard with the given ID.
//
// The ShardData column of the Shard table is read and gunzipped. shardCache is
// optional: when non-nil it is consulted first and, on a miss, filled with the
// decompressed bytes. The cached slice itself is returned (not a copy), so
// callers must not modify the result if they share the cache.
//
// An unknown shardID surfaces as the error returned by Row.Scan.
func getShard(db *sql.DB, shardID int, shardCache map[int][]byte) ([]byte, error) {
	// Indexing a nil map is safe and simply misses, so no nil check is needed.
	if cached, ok := shardCache[shardID]; ok {
		return cached, nil
	}

	var compressed []byte
	row := db.QueryRow("SELECT ShardData FROM Shard WHERE ID == ?", shardID)
	if err := row.Scan(&compressed); err != nil {
		return nil, err
	}

	zr, err := gzip.NewReader(bytes.NewReader(compressed))
	if err != nil {
		return nil, err
	}
	defer zr.Close()

	data, err := ioutil.ReadAll(zr)
	if err != nil {
		return nil, err
	}

	// Writing to a nil map would panic, so only store when a cache was given.
	if shardCache != nil {
		shardCache[shardID] = data
	}
	return data, nil
}

// splitShard splits a decompressed shard into its segments, keyed by
// zero-based segment index.
//
// The layout below is reverse-engineered and checked as the shard is walked:
//
//	<fixed 22-byte header> <segment>... 0B
//
// where every segment looks like
//
//	06 ## 00 00 00 ## ## __...
//	   |           |     ^ content (should start and end with { and })
//	   ^ index     ^ some sort of size
//
// The index byte starts at 2, so 2 is subtracted to get the map key. Some
// segments carry one extra byte between the size and the opening brace.
//
// Any input that does not match these assumptions yields an error rather than
// a panic; for content that does not look like JSON, the start of the
// offending segment is also dumped to stderr to help refine the format guess.
func splitShard(shard []byte) (map[int][]byte, error) {
	hdr := []byte{0x00, 0x01, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x01, 0x00, 0x00, 0x00}
	if !bytes.HasPrefix(shard, hdr) {
		return nil, errors.New("unknown shard header")
	}

	if !bytes.HasSuffix(shard, []byte{0x0B}) {
		return nil, errors.New("unexpected shard end")
	}

	segments := map[int][]byte{}

	// Split on the 0x06 segment marker; the first piece is the shard header.
	spl := bytes.Split(shard[:len(shard)-1], []byte{0x06})[1:]
	for i, segment := range spl {
		if len(segment) <= 6 {
			// A piece this short cannot hold a whole segment header, so the
			// split happened on a 0x06 inside the header (index or size
			// byte). Glue it back onto the next piece, which the loop then
			// sees in its merged form.
			if i+1 >= len(spl) {
				return nil, errors.New("truncated segment at end of shard")
			}
			merged := make([]byte, 0, len(segment)+1+len(spl[i+1]))
			merged = append(merged, segment...)
			merged = append(merged, 0x06)
			merged = append(merged, spl[i+1]...)
			spl[i+1] = merged
			continue
		}
		if segment[1] != 0x00 || segment[2] != 0x00 || segment[3] != 0x00 {
			return nil, errors.New("unhandled non-zero bytes in shard header")
		}

		start := 6
		if segment[6] != '{' || segment[len(segment)-1] != '}' {
			// Variant with one extra byte before the JSON content.
			if len(segment) <= 7 || segment[7] != '{' {
				// Dump at most the first 20 bytes for inspection.
				n := len(segment)
				if n > 20 {
					n = 20
				}
				fmt.Fprintf(os.Stderr, "%#v\n", string(segment[:n]))
				fmt.Fprintf(os.Stderr, "%#v\n", segment[6])
				fmt.Fprintf(os.Stderr, "%#v\n", segment[:n])
				return nil, errors.New("segment header guess was wrong: invalid json afterwards")
			}
			start = 7
		}

		idx := int(segment[0]) - 2
		if idx < 0 {
			return nil, errors.New("segment header guess was wrong: the index does not always start at 2")
		}
		if _, seen := segments[idx]; seen {
			return nil, errors.New("segment header guess was wrong: the second byte is not the index")
		}
		segments[idx] = segment[start:]
	}

	return segments, nil
}

// extractIdx returns the content of segment number idx within shard.
//
// The shard is split with splitShard; a splitting failure is passed through
// unchanged, and an index that is not present yields "no such segment".
//
// Words worth testing with:
// - ruffed: last word in file
// - salinency: problematic
// - curvet: slightly different format
// - test
// - example
// - edge
func extractIdx(shard []byte, idx int) ([]byte, error) {
	byIndex, splitErr := splitShard(shard)
	if splitErr != nil {
		return nil, splitErr
	}
	if content, found := byIndex[idx]; found {
		return content, nil
	}
	return nil, errors.New("no such segment")
}

// Word is the decoded form of one dictionary entry. parseWord fills it from a
// JSON segment using encoding/json; the fields carry no struct tags, so each
// one is matched against the JSON key of the same name.
//
// NOTE(review): the field meanings below are inferred from the names only;
// confirm against real segment data.
type Word struct {
	Name               string
	PhoneticName       string
	Pronunciation      string
	PronunciationAudio struct {
		// Presumably the location of an audio clip — TODO confirm.
		ContentURL string
	}
	// One group per entry in the JSON array; each bundles meanings, word
	// forms and parts of speech.
	MeaningGroups []struct {
		Meanings []struct {
			// First is the main meaning, next are submeanings
			RichDefinitions []struct {
				Fragments []struct {
					Text string // Is there EVER more than one fragment!?!
				}
				Examples []string
				Synonyms []struct {
					Name string
				}
			}
		}
		WordForms []struct {
			Form string
			// Only the Name of the referenced word is decoded here.
			Word struct {
				Name string
			}
		}
		PartsOfSpeech []struct {
			Name string
		}
	}
	WordOrigin string
}

// parseWord decodes buf as JSON into a freshly allocated Word.
//
// The pointer is returned even when decoding fails, together with the
// decoding error; in that case the Word may be only partially filled in.
func parseWord(buf []byte) (*Word, error) {
	entry := new(Word)
	decodeErr := json.Unmarshal(buf, entry)
	return entry, decodeErr
}