feat(binana): add tokens database
Some checks failed
Push / build (push) Has been cancelled

This commit is contained in:
phaneron 2026-03-20 01:58:16 -04:00
parent ac268a16c8
commit 2c2815ab0b
22 changed files with 2122 additions and 2 deletions

View file

@ -0,0 +1,14 @@
package dbutil
import "errors"
// DatabaseFormat selects the on-disk encoding used by a dbutil Writer.
type DatabaseFormat uint8

const (
	// DatabaseParquet writes entries as an Apache Parquet file.
	DatabaseParquet DatabaseFormat = iota
	// DatabaseJSON writes entries as newline-delimited JSON.
	DatabaseJSON
)

var (
	// ErrUnknownDatabaseFormat is returned by Open when the requested
	// DatabaseFormat is not one of the constants above.
	ErrUnknownDatabaseFormat = errors.New("dbutil: unknown database format")
)

View file

@ -0,0 +1,80 @@
package dbutil
import (
"encoding/json"
"fmt"
"io"
"os"
"github.com/parquet-go/parquet-go"
)
// Writer writes entries of type T to a database stream. The concrete
// encoding (JSON or Parquet) is bound at Open time via the write/close
// closures; callers only see WriteEntries and Close.
type Writer[T any] struct {
	// write encodes a batch of entries to the output stream
	write func([]T) (err error)
	// close flushes encoder state and releases the underlying file, if any
	close func() (err error)
}
// WriteEntries appends the given entries to the underlying database stream
// using the encoding chosen at Open time.
func (writer *Writer[T]) WriteEntries(entries []T) (err error) {
	return writer.write(entries)
}
// Close flushes any pending encoder state and closes the underlying file.
// It must be called once writing is finished.
func (writer *Writer[T]) Close() (err error) {
	return writer.close()
}
// Open creates a database Writer for the given file name and format. An
// empty name writes to stdout. The returned Writer must be Closed to flush
// encoder state (the Parquet footer in particular) and release the file.
// Returns ErrUnknownDatabaseFormat (wrapped) for an unrecognized format.
func Open[T any](name string, format DatabaseFormat) (writer *Writer[T], err error) {
	writer = new(Writer[T])
	var (
		file   *os.File
		output io.Writer
	)
	if name == "" {
		output = os.Stdout
	} else {
		file, err = os.Create(name)
		if err != nil {
			return
		}
		output = file
	}
	switch format {
	case DatabaseJSON:
		encoder := json.NewEncoder(output)
		writer.write = func(entries []T) (err error) {
			// one JSON document per entry (newline-delimited JSON)
			for _, entry := range entries {
				if err = encoder.Encode(&entry); err != nil {
					return
				}
			}
			return
		}
		writer.close = func() (err error) {
			if file != nil {
				err = file.Close()
			}
			return
		}
	case DatabaseParquet:
		generic_writer := parquet.NewGenericWriter[T](output)
		writer.write = func(entries []T) (err error) {
			_, err = generic_writer.Write(entries)
			return
		}
		writer.close = func() (err error) {
			// flush the parquet footer before closing the file
			if err = generic_writer.Close(); err != nil {
				return
			}
			if file != nil {
				err = file.Close()
			}
			return
		}
	default:
		// don't leak the file we just created when the format is invalid
		// (the original returned with the handle still open)
		if file != nil {
			file.Close()
		}
		err = fmt.Errorf("%w: %d", ErrUnknownDatabaseFormat, format)
	}
	return
}

49
go/app/util/demangle.go Normal file
View file

@ -0,0 +1,49 @@
package util
import (
"bytes"
"fmt"
"os/exec"
"strings"
)
// demangle_cache memoizes successful demanglings, keyed by the mangled input.
var demangle_cache = make(map[string]string)

// demangle runs the external "demumble" tool on str and returns the
// demangled form. It returns an error when the tool cannot be run, produces
// no output, or echoes the input unchanged (demumble's signal that the
// input is not a mangled name). Successful results are cached.
func demangle(str string) (demangled string, err error) {
	var ok bool
	if demangled, ok = demangle_cache[str]; ok {
		return
	}
	var output bytes.Buffer
	c := exec.Command("demumble", str)
	c.Stdout = &output
	// surface launch failures (e.g. demumble not installed) instead of
	// silently ignoring them as the original did
	if err = c.Run(); err != nil {
		err = fmt.Errorf("cannot demangle: %w", err)
		return
	}
	if output.Len() == 0 {
		err = fmt.Errorf("cannot demangle")
		return
	}
	demangled = strings.TrimSuffix(output.String(), "\n")
	// demumble echoes its input when it cannot demangle it
	if str == demangled {
		err = fmt.Errorf("cannot demangle")
		return
	}
	demangle_cache[str] = demangled
	return
}
// looks_mangled reports whether str resembles a compiler-mangled identifier:
// a ".", "?" (MSVC) or "_Z" (Itanium) prefix, a leading digit, or a
// space-free string containing digits.
func looks_mangled(str string) bool {
	switch {
	case strings.HasPrefix(str, "."),
		strings.HasPrefix(str, "?"),
		strings.HasPrefix(str, "_Z"):
		return true
	case len(str) > 0 && str[0] >= '0' && str[0] <= '9':
		return true
	case strings.ContainsAny(str, "0123456789") && !strings.Contains(str, " "):
		return true
	default:
		return false
	}
}

31
go/app/util/exe.go Normal file
View file

@ -0,0 +1,31 @@
package util
import (
"debug/pe"
"os"
)
// get_exe_base_address returns the preferred load address (ImageBase) of a
// PE executable, falling back to the classic 0x400000 default when the
// optional header type is unrecognized.
func get_exe_base_address(name string) (base_address uint64, err error) {
	var file *os.File
	file, err = os.Open(name)
	if err != nil {
		return
	}
	// close on every path; the original leaked the handle when pe.NewFile failed
	defer file.Close()
	var pe_file *pe.File
	pe_file, err = pe.NewFile(file)
	if err != nil {
		return
	}
	base_address = uint64(0x400000)
	switch h := pe_file.OptionalHeader.(type) {
	case *pe.OptionalHeader32:
		base_address = uint64(h.ImageBase)
	case *pe.OptionalHeader64:
		base_address = h.ImageBase
	}
	return
}

19
go/app/util/hash.go Normal file
View file

@ -0,0 +1,19 @@
package util
import (
"crypto/sha256"
"encoding/hex"
"os"
)
func hash_file(name string) (id string, err error) {
var b []byte
b, err = os.ReadFile(name)
if err != nil {
return
}
h := sha256.New()
h.Write(b[:])
id = hex.EncodeToString(h.Sum(nil))
return
}

224
go/app/util/make-samples.go Normal file
View file

@ -0,0 +1,224 @@
package util
import (
"bufio"
"bytes"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"github.com/thunderbrewhq/binana/go/app"
"github.com/thunderbrewhq/binana/go/app/util/dbutil"
"github.com/thunderbrewhq/binana/go/db"
)
// MakeSampleDatabaseParams configures MakeSampleDatabase.
type MakeSampleDatabaseParams struct {
	// A file name that corresponds to a tree of sample files.
	// Anything in this tree will be collected into the sample database.
	Source string
	// The name of the file to write the database to.
	Output string
	// Sets the format of the database file.
	Format dbutil.DatabaseFormat
	// URLs that map to the root of the sample tree hierarchy.
	// Used to generate a list of mirror URLs for sample binaries.
	DirectMirrors []string
	// List of IPFS gateway URLs.
	// If not empty, a CID for the sample tree will be created.
	// Actually uploading anything in the sample tree, however,
	// is outside the scope of this tool.
	IPFSGateways []string
}
// sample_database accumulates db.Sample entries in memory and flushes them
// to the database writer when Close is called.
type sample_database struct {
	writer *dbutil.Writer[db.Sample]
	// root CID of the sample tree when IPFS mirrors were requested; empty otherwise
	ipfs_tree_cid string
	// samples queued by add, written in one batch by Close
	buffer []db.Sample
}
// add queues one sample for writing; queued samples are flushed by Close.
func (sample_database *sample_database) add(sample db.Sample) (err error) {
	sample_database.buffer = append(sample_database.buffer, sample)
	return nil
}
// Close writes all buffered samples in one batch, then closes the
// underlying database writer.
func (sample_database *sample_database) Close() (err error) {
	if err = sample_database.writer.WriteEntries(sample_database.buffer); err != nil {
		return err
	}
	return sample_database.writer.Close()
}
// make_sample_file classifies one file in the sample tree, hashes it,
// parses identifying metadata out of its filename, attaches mirror URLs and
// queues a db.Sample entry. Filenames must be shaped like
// <program>-<version>-<build>-<os>-<arch>.<ext>; files with unrecognized
// extensions are skipped. Unlike the original, malformed names and hashing
// failures return an error instead of panicking.
func (sample_database *sample_database) make_sample_file(params *MakeSampleDatabaseParams, name, relative_name string) (err error) {
	var sample db.Sample
	// infer mime-type from extension
	switch filepath.Ext(name) {
	case ".exe":
		sample.MimeType = "application/vnd.microsoft.portable-executable"
	case ".pdb":
		sample.MimeType = "application/x-ms-pdb"
		// associate the PDB with its EXE (best effort: only when the EXE exists)
		sample_exe_name := strings.TrimSuffix(name, ".pdb") + ".exe"
		if _, stat_err := os.Stat(sample_exe_name); stat_err == nil {
			if sample.Executable, err = hash_file(sample_exe_name); err != nil {
				return
			}
		}
	case ".macho":
		sample.MimeType = "application/x-mach-binary"
	case ".elf":
		sample.MimeType = "application/x-executable"
	default:
		// not a sample type we track
		return
	}
	if sample.ID, err = hash_file(name); err != nil {
		return
	}
	// get the base filename and split it (without its extension) into the
	// dash-separated metadata components
	base_name := filepath.Base(name)
	filename_components := strings.Split(strings.TrimSuffix(base_name, filepath.Ext(base_name)), "-")
	// the original indexed these without a length check and panicked on
	// misnamed samples
	if len(filename_components) < 5 {
		err = fmt.Errorf("util: sample %q is not named <program>-<version>-<build>-<os>-<arch>", name)
		return
	}
	sample.Program = filename_components[0]
	sample.Version = filename_components[1]
	var build uint64
	if build, err = strconv.ParseUint(filename_components[2], 0, 64); err != nil {
		return
	}
	sample.Build = uint32(build)
	sample.OS = filename_components[3]
	sample.Arch = filename_components[4]
	// now, create the various mirrors
	for _, direct_mirror := range params.DirectMirrors {
		sample.Mirrors = append(sample.Mirrors, db.SampleMirror{
			Kind: db.MirrorDirect,
			URL:  direct_mirror + relative_name,
		})
	}
	for _, ipfs_gateway := range params.IPFSGateways {
		sample.Mirrors = append(sample.Mirrors, db.SampleMirror{
			Kind: db.MirrorIPFS,
			URL:  ipfs_gateway + "/" + sample_database.ipfs_tree_cid + relative_name,
		})
	}
	// queue the sample for writing
	err = sample_database.add(sample)
	return
}
// make_tree recursively walks the sample tree rooted at name, queueing every
// regular file it finds. relative_name tracks the slash-separated path from
// the tree root, used to build mirror URLs.
func (sample_database *sample_database) make_tree(params *MakeSampleDatabaseParams, name, relative_name string) (err error) {
	var tree_entries []os.DirEntry
	if tree_entries, err = os.ReadDir(name); err != nil {
		return
	}
	for _, tree_entry := range tree_entries {
		child := name + "/" + tree_entry.Name()
		relative_child := relative_name + "/" + tree_entry.Name()
		if tree_entry.IsDir() {
			err = sample_database.make_tree(params, child, relative_child)
		} else {
			err = sample_database.make_sample_file(params, child, relative_child)
		}
		if err != nil {
			return
		}
	}
	return
}
// ipfs_generate_file_cid computes the IPFS CID of the tree rooted at name
// without uploading anything, by running
//
//	ipfs add -qr --only-hash .
//
// inside the tree root. The last CID printed is the root of the tree.
func ipfs_generate_file_cid(name string) (cid string, err error) {
	command := exec.Command("ipfs", "add", "-qr", "--only-hash", ".")
	// run inside the tree root via Cmd.Dir instead of mutating the
	// process-wide working directory with os.Chdir
	command.Dir = name
	var command_output bytes.Buffer
	command.Stdout = &command_output
	// Run's error covers both launch failures (ipfs not installed) and
	// non-zero exits; the original ignored it and dereferenced a nil
	// ProcessState when the tool could not be started
	if err = command.Run(); err != nil {
		err = fmt.Errorf("util: running ipfs: %w", err)
		return
	}
	if command.ProcessState.ExitCode() != 0 {
		err = fmt.Errorf("util: ipfs tool exited: %d", command.ProcessState.ExitCode())
		return
	}
	// the last line of output is the root CID
	command_output_scanner := bufio.NewScanner(&command_output)
	for command_output_scanner.Scan() {
		cid = command_output_scanner.Text()
	}
	err = command_output_scanner.Err()
	return
}
// MakeSampleDatabase builds a sample database file from a tree of sample
// binaries. Errors are reported through app.Fatal; each failure now returns
// immediately (the original returned after Fatal in one branch but fell
// through in the others, which would dereference a nil writer if Fatal
// does not terminate the process).
func MakeSampleDatabase(params *MakeSampleDatabaseParams) {
	var (
		err             error
		sample_database sample_database
	)
	// if we want to generate IPFS links, start by getting the CID for the sample tree
	if len(params.IPFSGateways) != 0 {
		sample_database.ipfs_tree_cid, err = ipfs_generate_file_cid(params.Source)
		if err != nil {
			app.Fatal(err)
			return
		}
	}
	sample_database.writer, err = dbutil.Open[db.Sample](params.Output, params.Format)
	if err != nil {
		app.Fatal(err)
		return
	}
	// make the root tree, with our params, the source as the first tree,
	// and "" (root) as the relative path
	if err = sample_database.make_tree(params, params.Source, ""); err != nil {
		app.Fatal(err)
		return
	}
	if err = sample_database.Close(); err != nil {
		app.Fatal(err)
	}
}

303
go/app/util/make-tokens.go Normal file
View file

@ -0,0 +1,303 @@
package util
import (
"compress/gzip"
"debug/macho"
"encoding/json"
"fmt"
"os"
"path/filepath"
"slices"
"strings"
"time"
"github.com/thunderbrewhq/binana/go/app"
"github.com/thunderbrewhq/binana/go/app/util/dbutil"
"github.com/thunderbrewhq/binana/go/db"
"github.com/thunderbrewhq/binana/go/pdbconv"
"github.com/thunderbrewhq/binana/go/stringrecovery"
)
// MakeTokenDatabaseParams configures MakeTokenDatabase.
type MakeTokenDatabaseParams struct {
	// A sample file, or a directory tree of samples, to mine for tokens.
	Source string
	// The name of the file to write the token database to.
	Output string
	// Sets the format of the database file.
	Format dbutil.DatabaseFormat
}
// MakeTokenDatabase extracts symbol/string/datatype tokens from the samples
// under params.Source and writes them to a token database. Errors are
// reported through app.Fatal; each failure returns immediately (the
// original fell through to the next step, which would operate on a
// half-initialized database if Fatal does not terminate the process).
func MakeTokenDatabase(params *MakeTokenDatabaseParams) {
	var (
		tokens_database tokens_database
		err             error
	)
	if err = tokens_database.Open(params.Output, params.Format); err != nil {
		app.Fatal(err)
		return
	}
	if err = tokens_database.make(params.Source); err != nil {
		app.Fatal(err)
		return
	}
	if err = tokens_database.Close(); err != nil {
		app.Fatal(err)
	}
}
// tokens_database assigns sequential token IDs and writes db.Token entries
// through a dbutil Writer.
type tokens_database struct {
	// next ID handed out by next_token_id; starts at 1 (see Open)
	sequence uint64
	writer *dbutil.Writer[db.Token]
}
// next_token_id returns the current sequence value and advances it by one.
func (tokens_database *tokens_database) next_token_id() (id uint64) {
	current := tokens_database.sequence
	tokens_database.sequence = current + 1
	return current
}
// Open initializes the ID sequence (IDs start at 1, so 0 means "unset")
// and opens the output database writer.
func (tokens_database *tokens_database) Open(name string, format dbutil.DatabaseFormat) (err error) {
	tokens_database.sequence = 1
	tokens_database.writer, err = dbutil.Open[db.Token](name, format)
	return err
}
// Close flushes and closes the underlying database writer.
func (tokens_database *tokens_database) Close() (err error) {
	return tokens_database.writer.Close()
}
// Write stores a single token in the database.
func (tokens_database *tokens_database) Write(token *db.Token) (err error) {
	return tokens_database.writer.WriteEntries([]db.Token{*token})
}
// make_file_pdb mines a .pdb for tokens via its pre-converted .pdb.json.gz
// sidecar file. The matching .exe next to the PDB provides the base address
// used to rebase symbol offsets.
func (tokens_database *tokens_database) make_file_pdb(name string) (err error) {
	exe_name := strings.TrimSuffix(name, ".pdb") + ".exe"
	var base_address uint64
	if base_address, err = get_exe_base_address(exe_name); err != nil {
		return
	}
	fmt.Fprintln(os.Stderr, "[pdb]", name)
	var source_id string
	if source_id, err = hash_file(name); err != nil {
		return
	}
	fmt.Fprintln(os.Stderr, "[pdb]", source_id)
	// check for the existence of the .pdb.json.gz sidecar
	if _, err = os.Stat(name + ".json.gz"); err != nil {
		return
	}
	var gzip_file *os.File
	if gzip_file, err = os.Open(name + ".json.gz"); err != nil {
		return
	}
	// close on every path; the original leaked the file on decode errors
	defer gzip_file.Close()
	var gzip_reader *gzip.Reader
	// the original used the reader without checking this error, which nil-
	// dereferences on a corrupt gzip header
	if gzip_reader, err = gzip.NewReader(gzip_file); err != nil {
		return
	}
	defer gzip_reader.Close()
	var pdb pdbconv.ProgramDatabase
	if err = json.NewDecoder(gzip_reader).Decode(&pdb); err != nil {
		return
	}
	var v pdb_token_visitor
	v.init(tokens_database, source_id, base_address)
	if err = v.visit_all(&pdb); err != nil {
		return
	}
	err = v.write_tokens()
	return
}
// write_string_token emits an OriginalStringToken for a string found at
// address inside section_name of the sample identified by source_id.
func (tokens_database *tokens_database) write_string_token(source_id string, section_name string, address uint64, str string) (err error) {
	db_token := db.Token{
		ID:      tokens_database.next_token_id(),
		Source:  source_id,
		Section: section_name,
		Kind:    db.OriginalStringToken,
		Offset:  fmt.Sprintf("%X", address),
		Names:   []db.TokenName{{db.OriginalName, str}},
	}
	// detect if this is a mangled type identifier and, if so, record a
	// demangled alias (best effort)
	if looks_mangled(str) {
		if demangled, demangle_err := demangle(str); demangle_err == nil {
			db_token.Names = append(db_token.Names, db.TokenName{db.DemangledName, demangled})
		}
	}
	return tokens_database.Write(&db_token)
}
// make_file_pe recovers strings from a PE executable and records them as
// string tokens.
func (tokens_database *tokens_database) make_file_pe(name string) (err error) {
	var id string
	if id, err = hash_file(name); err != nil {
		return
	}
	// the callback cannot return an error, so capture the first write
	// failure (the original dropped it entirely)
	var write_err error
	err = stringrecovery.RecoverFile(name, func(section_name string, address uint64, str string) {
		fmt.Fprintf(os.Stderr, "[pe] string found: %s %08X %s\n", section_name, address, str)
		if w := tokens_database.write_string_token(id, section_name, address, str); w != nil && write_err == nil {
			write_err = w
		}
	})
	if err == nil {
		err = write_err
	}
	return
}
// make_file_macho mines a Mach-O binary for symbol tokens (from its symtab)
// and string tokens (via string recovery). Diagnostic progress is logged to
// stderr.
func (tokens_database *tokens_database) make_file_macho(name string) (err error) {
	fmt.Fprintln(os.Stderr, "[mach-o]", name)
	var source_id string
	if source_id, err = hash_file(name); err != nil {
		return
	}
	fmt.Fprintln(os.Stderr, "[mach-o]", source_id)
	var file *os.File
	if file, err = os.Open(name); err != nil {
		return
	}
	// close on every path; the original leaked the handle on parse errors
	defer file.Close()
	var macho_file *macho.File
	if macho_file, err = macho.NewFile(file); err != nil {
		return
	}
	if _, dwarf_err := macho_file.DWARF(); dwarf_err == nil {
		fmt.Fprintln(os.Stderr, "DWARF!")
		time.Sleep(5 * time.Second)
	}
	fmt.Fprintln(os.Stderr, "[mach-o]", "cpu", macho_file.FileHeader.Cpu)
	fmt.Fprintln(os.Stderr, "[mach-o]", "loads:")
	// for _, load := range macho_file.Loads {
	// 	fmt.Fprintln(os.Stderr, "[mach-o]", load.String())
	// }
	fmt.Fprintln(os.Stderr, "[mach-o]", "sections:")
	for _, section := range macho_file.Sections {
		fmt.Fprintln(os.Stderr, "section", section.SectionHeader.Name)
	}
	// the original had this nil check inverted, logging "does not contain a
	// dysymtab" exactly when one was present
	if macho_file.Dysymtab == nil {
		fmt.Fprintln(os.Stderr, "[mach-o]", "does not contain a dysymtab")
	} else {
		fmt.Fprintln(os.Stderr, "[mach-o]", "dysymtab:")
	}
	var imported_symbols []string
	if imported_symbols, err = macho_file.ImportedSymbols(); err != nil {
		return
	}
	if macho_file.Symtab == nil {
		fmt.Fprintln(os.Stderr, "[mach-o]", "does not contain a symtab")
	} else {
		fmt.Fprintln(os.Stderr, "[mach-o]", "symtab:")
		for _, sym := range macho_file.Symtab.Syms {
			imported := slices.Contains(imported_symbols, sym.Name)
			// resolve the symbol's section name, if it lives in one
			var section_name string
			if sym.Sect != 0 {
				section_name = macho_file.Sections[sym.Sect-1].SectionHeader.Name
			}
			if imported {
				fmt.Fprintf(os.Stderr, "[mach-o] imported %s %02x %s\n", section_name, sym.Type, sym.Name)
				continue
			}
			if sym.Name == "" {
				// fmt.Fprintln(os.Stderr, "[mach-o]", "symbol has no name", "sect="+section_name, sym.Type, sym.Value, sym.Desc)
				continue
			}
			fmt.Fprintf(os.Stderr, "[mach-o] internal %s %02x %s\n", section_name, sym.Type, sym.Name)
			var token db.Token
			token.ID = tokens_database.next_token_id()
			token.Source = source_id
			token.Kind = db.OriginalSymbolToken
			token.Section = section_name
			token.Offset = fmt.Sprintf("%X", sym.Value)
			token.Names = append(token.Names, db.TokenName{db.OriginalName, sym.Name})
			if looks_mangled(sym.Name) {
				if demangled, demangle_err := demangle(sym.Name); demangle_err == nil {
					token.Names = append(token.Names, db.TokenName{db.DemangledName, demangled})
				}
			}
			// the original discarded this error
			if err = tokens_database.Write(&token); err != nil {
				return
			}
		}
	}
	err = stringrecovery.RecoverFile(name, func(section_name string, address uint64, str string) {
		fmt.Fprintf(os.Stderr, "[mach-o] string found: %s %08X %s\n", section_name, address, str)
		tokens_database.write_string_token(source_id, section_name, address, str)
	})
	return
}
// make_file dispatches a single sample file to the token extractor for its
// format; files with unrecognized extensions are skipped silently.
func (tokens_database *tokens_database) make_file(name string) (err error) {
	switch filepath.Ext(name) {
	case ".macho":
		return tokens_database.make_file_macho(name)
	case ".pdb":
		return tokens_database.make_file_pdb(name)
	case ".exe":
		return tokens_database.make_file_pe(name)
	default:
		return nil
	}
}
// make_directory recursively extracts tokens from every file under name.
func (tokens_database *tokens_database) make_directory(name string) (err error) {
	var entries []os.DirEntry
	if entries, err = os.ReadDir(name); err != nil {
		return
	}
	for _, entry := range entries {
		child := filepath.Join(name, entry.Name())
		if entry.IsDir() {
			err = tokens_database.make_directory(child)
		} else {
			err = tokens_database.make_file(child)
		}
		if err != nil {
			return
		}
	}
	return
}
// make extracts tokens from name, which may be either a single sample file
// or a directory tree of samples.
func (tokens_database *tokens_database) make(name string) (err error) {
	var fi os.FileInfo
	if fi, err = os.Stat(name); err != nil {
		return
	}
	if fi.IsDir() {
		return tokens_database.make_directory(name)
	}
	return tokens_database.make_file(name)
}

474
go/app/util/pdb.go Normal file
View file

@ -0,0 +1,474 @@
package util
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"maps"
"slices"
"sort"
"strconv"
"github.com/thunderbrewhq/binana/go/db"
"github.com/thunderbrewhq/binana/go/pdbconv"
)
// pdb_token_visitor walks a pdbconv.ProgramDatabase and accumulates db.Token
// entries, deduplicated by name (datatypes, strings) or rebased address
// (symbols), before flushing them in deterministic order via write_tokens.
type pdb_token_visitor struct {
	tokens_database *tokens_database
	// location of the base module (preferred image base of the matching EXE);
	// added to every PDB-relative symbol address
	base_address uint64
	// hash of the PDB file these tokens originate from
	pdb_source_id string
	// string tokens keyed by their contents
	strings map[string]*db.Token
	// datatype tokens keyed by type name (content hash for enums; see visit_enum)
	datatypes map[string]*db.Token
	// maps a symbol to a token, keyed by rebased address
	symbols map[uint64]*db.Token
	// constant tokens, written in visit order
	constants []db.Token
}
// init prepares the visitor for a single PDB identified by pdb_source_id,
// whose symbol addresses will be rebased onto base_address.
func (v *pdb_token_visitor) init(tokens_database *tokens_database, pdb_source_id string, base_address uint64) {
	v.tokens_database = tokens_database
	v.pdb_source_id = pdb_source_id
	v.base_address = base_address
	v.strings = map[string]*db.Token{}
	v.datatypes = map[string]*db.Token{}
	v.symbols = map[uint64]*db.Token{}
}
// visit_class records a class definition as an OriginalDatatypeToken,
// merging into any token already collected under the same name.
func (v *pdb_token_visitor) visit_class(class *pdbconv.Class) (err error) {
	token, ok := v.datatypes[class.Name]
	if !ok {
		token = new(db.Token)
		v.datatypes[class.Name] = token
	}
	// set token source to pdb
	token.Source = v.pdb_source_id
	// kind is debug information token
	token.Kind = db.OriginalDatatypeToken
	// set the basic type
	token.Keyword = "class"
	// set original name; guard against duplicates when the same class is
	// visited more than once (consistent with visit_table_symbol, which
	// already dedupes names — the original appended unconditionally here)
	name := db.TokenName{db.OriginalName, class.Name}
	if !slices.Contains(token.Names, name) {
		token.Names = append(token.Names, name)
	}
	for _, member := range class.Members {
		var token_member db.TokenMember
		if member.Kind == "Member" {
			token_member.Kind = db.FieldMember
		} else if member.Kind == "Unknown" && member.Datatype == "void *" {
			token_member.Kind = db.MethodMember
		} else {
			continue
		}
		token_member.Key = member.Name
		token_member.Value = member.Datatype
		if !slices.Contains(token.Members, token_member) {
			token.Members = append(token.Members, token_member)
		}
	}
	return
}
// visit_datatype records a struct or union definition as an
// OriginalDatatypeToken, merging into any token already collected under the
// same name. Unrecognized datatype kinds are an error.
func (v *pdb_token_visitor) visit_datatype(datatype *pdbconv.Datatype) (err error) {
	token, ok := v.datatypes[datatype.Name]
	if !ok {
		token = new(db.Token)
		v.datatypes[datatype.Name] = token
	}
	// set token source to pdb
	token.Source = v.pdb_source_id
	// kind is debug information token
	token.Kind = db.OriginalDatatypeToken
	// set original name; guard against duplicates when the same datatype is
	// visited more than once (consistent with visit_table_symbol, which
	// already dedupes names — the original appended unconditionally here)
	name := db.TokenName{db.OriginalName, datatype.Name}
	if !slices.Contains(token.Names, name) {
		token.Names = append(token.Names, name)
	}
	// set the basic type
	if datatype.Kind == "Structure" {
		token.Keyword = "struct"
	} else if datatype.Kind == "Union" {
		token.Keyword = "union"
	} else {
		err = fmt.Errorf("unhandled datatype kind '%s'", datatype.Kind)
		return
	}
	for _, member := range datatype.Members {
		var token_member db.TokenMember
		if member.Kind == "Member" {
			token_member.Kind = db.FieldMember
		} else if member.Kind == "Unknown" && member.Datatype == "void *" {
			token_member.Kind = db.MethodMember
		} else {
			continue
		}
		token_member.Key = member.Name
		token_member.Value = member.Datatype
		if !slices.Contains(token.Members, token_member) {
			token.Members = append(token.Members, token_member)
		}
	}
	return
}
// visit_function records a function as a symbol token at its rebased
// address, harvests the source-file names from its line records as string
// tokens, and classifies its stack variables as token members.
func (v *pdb_token_visitor) visit_function(function *pdbconv.Function) (err error) {
	var address uint64
	if address, err = strconv.ParseUint(function.Address, 0, 64); err != nil {
		return
	}
	address += v.base_address
	token := v.symbols[address]
	if token == nil {
		token = new(db.Token)
		v.symbols[address] = token
	}
	token.Source = v.pdb_source_id
	// kind is symbol information token
	token.Kind = db.OriginalSymbolToken
	token.Offset = fmt.Sprintf("%X", address)
	token.Names = append(token.Names, db.TokenName{db.OriginalName, function.Name})
	// record a demangled alias when possible (best effort)
	if looks_mangled(function.Name) {
		if demangled, demangler_err := demangle(function.Name); demangler_err == nil {
			token.Names = append(token.Names, db.TokenName{db.DemangledName, demangled})
		}
	}
	// source files referenced by line records become string tokens
	for _, line_number := range function.LineNumbers {
		if err = v.visit_string(line_number.SourceFile); err != nil {
			return
		}
	}
	// classify stack variables as members
	for _, stack_variable := range function.StackVariables {
		var token_member db.TokenMember
		switch stack_variable.Kind {
		case "Parameter", "ObjectPointer":
			token_member.Kind = db.ParameterMember
		case "Local":
			token_member.Kind = db.LocalMember
		case "StaticLocal":
			token_member.Kind = db.StaticLocalMember
		case "Constant":
			// these are repeated elsewhere
			continue
		default:
			err = fmt.Errorf("unhandled stack variable kind '%s'", stack_variable.Kind)
			return
		}
		token_member.Key = stack_variable.Name
		token_member.Value = stack_variable.Datatype
		token.Members = append(token.Members, token_member)
	}
	return
}
// visit_enum records an enum as an OriginalDatatypeToken. Because enums are
// frequently unnamed ("__unnamed"), the datatype key is a SHA-256 over the
// enum's name plus its member name/value pairs, rather than the name alone.
func (v *pdb_token_visitor) visit_enum(enum *pdbconv.Enum) (err error) {
	digest := sha256.New()
	digest.Write([]byte(enum.Name))
	for _, member := range enum.Members {
		digest.Write([]byte(member.Name))
		digest.Write([]byte(fmt.Sprintf("%d", member.Value)))
	}
	key := hex.EncodeToString(digest.Sum(nil))
	token := v.datatypes[key]
	if token == nil {
		token = new(db.Token)
		v.datatypes[key] = token
	}
	token.Source = v.pdb_source_id
	token.Keyword = "enum"
	// apply name (may be __unnamed)
	token.Names = append(token.Names, db.TokenName{db.OriginalName, enum.Name})
	// this is an original datatype
	token.Kind = db.OriginalDatatypeToken
	for _, member := range enum.Members {
		token_member := db.TokenMember{
			Kind:  db.EnumMember,
			Key:   member.Name,
			Value: fmt.Sprintf("%d", member.Value),
		}
		if !slices.Contains(token.Members, token_member) {
			token.Members = append(token.Members, token_member)
		}
	}
	return
}
// visit_string records s as an OriginalStringToken, creating the token on
// first sight. The single name is (re)set rather than appended, so repeat
// visits cannot accumulate duplicates.
func (v *pdb_token_visitor) visit_string(s string) (err error) {
	token := v.strings[s]
	if token == nil {
		token = new(db.Token)
		v.strings[s] = token
	}
	// apply source
	token.Source = v.pdb_source_id
	// this is a string token
	token.Kind = db.OriginalStringToken
	token.Names = []db.TokenName{{Kind: db.OriginalName, Name: s}}
	return
}
// visit_source_files_table records every source-file name in the table as a
// string token.
func (v *pdb_token_visitor) visit_source_files_table(table *pdbconv.Table) (err error) {
	for i := range table.SourceFiles {
		if err = v.visit_string(table.SourceFiles[i].Name); err != nil {
			return
		}
	}
	return
}
// visit_constant records a named compile-time constant. Unlike datatypes
// and symbols, constants are not deduplicated; they are buffered and
// written in visit order by write_tokens.
func (v *pdb_token_visitor) visit_constant(symbol *pdbconv.TableSymbol) (err error) {
	var token db.Token
	token.ID = v.tokens_database.next_token_id()
	token.Source = v.pdb_source_id
	// the original never set Kind, so constant tokens carried a zero value
	// and could not be recognized by consumers that switch on token.Kind
	// (present_token in query handles db.OriginalConstantToken)
	token.Kind = db.OriginalConstantToken
	token.Keyword = "const"
	token.Datatype = symbol.Datatype
	if symbol.Name != "" {
		token.Names = append(token.Names, db.TokenName{Kind: db.OriginalName, Name: symbol.Name})
	}
	if symbol.Undecorated != "" {
		token.Names = append(token.Names, db.TokenName{Kind: db.OriginalName, Name: symbol.Undecorated})
	}
	token.Members = append(token.Members, db.TokenMember{Kind: db.ConstantValueMember, Value: symbol.Value})
	v.constants = append(v.constants, token)
	return
}
// visit_table_symbol records one symbol-table entry. Entries at address 0x0
// do not correspond to a real symbol: constants are collected separately,
// and any remaining names are still mined as string tokens. Real symbols
// are merged by rebased address.
func (v *pdb_token_visitor) visit_table_symbol(symbol *pdbconv.TableSymbol) (err error) {
	if symbol.Address == "0x0" {
		if symbol.Value != "" && symbol.Kind == "Constant" {
			err = v.visit_constant(symbol)
			return
		}
		// so, this does not correspond to an actual symbol.
		// we can still mine it for string tokens.
		if symbol.Name != "" {
			if err = v.visit_string(symbol.Name); err != nil {
				return
			}
		}
		if symbol.Undecorated != "" {
			// fixed: the original visited symbol.Name again here, dropping
			// the undecorated form entirely
			if err = v.visit_string(symbol.Undecorated); err != nil {
				return
			}
		}
		return
	}
	// this corresponds to an address: compute the real (rebased) address
	var address uint64
	if address, err = strconv.ParseUint(symbol.Address, 0, 64); err != nil {
		return
	}
	address += v.base_address
	token, ok := v.symbols[address]
	if !ok {
		token = new(db.Token)
		v.symbols[address] = token
	}
	token.Source = v.pdb_source_id
	if symbol.Datatype != "" {
		token.Datatype = symbol.Datatype
	}
	if symbol.Kind == "FileStatic" {
		token.Keyword = "static"
	} else if symbol.Kind == "Global" {
		token.Keyword = "global"
	}
	if symbol.Name != "" {
		token_name := db.TokenName{db.OriginalName, symbol.Name}
		if !slices.Contains(token.Names, token_name) {
			token.Names = append(token.Names, token_name)
		}
	}
	if symbol.Undecorated != "" {
		undecorated := db.TokenName{db.OriginalName, symbol.Undecorated}
		if !slices.Contains(token.Names, undecorated) {
			token.Names = append(token.Names, undecorated)
		}
	}
	return
}
// visit_symbols_table records every entry of the Symbols table.
func (v *pdb_token_visitor) visit_symbols_table(table *pdbconv.Table) (err error) {
	for i := range table.Symbols {
		if err = v.visit_table_symbol(&table.Symbols[i]); err != nil {
			return
		}
	}
	return
}
// visit_table dispatches the two table kinds we mine; other tables are
// ignored.
func (v *pdb_token_visitor) visit_table(table *pdbconv.Table) (err error) {
	switch table.Name {
	case "SourceFiles":
		err = v.visit_source_files_table(table)
	case "Symbols":
		err = v.visit_symbols_table(table)
	}
	return
}
// visit_typedef records a typedef as a datatype token whose Datatype field
// carries the aliased base type.
func (v *pdb_token_visitor) visit_typedef(typedef *pdbconv.Typedef) (err error) {
	token := v.datatypes[typedef.Name]
	if token == nil {
		token = new(db.Token)
		v.datatypes[typedef.Name] = token
	}
	token.Source = v.pdb_source_id
	token.Kind = db.OriginalDatatypeToken
	token_name := db.TokenName{Kind: db.OriginalName, Name: typedef.Name}
	if !slices.Contains(token.Names, token_name) {
		token.Names = append(token.Names, token_name)
	}
	token.Datatype = typedef.Basetype
	return
}
// visit_all walks every section of the converted PDB in a fixed order:
// classes, datatypes, enums, functions, tables, typedefs.
func (v *pdb_token_visitor) visit_all(pdb *pdbconv.ProgramDatabase) (err error) {
	for i := range pdb.Classes {
		if err = v.visit_class(&pdb.Classes[i]); err != nil {
			return
		}
	}
	for i := range pdb.Datatypes {
		if err = v.visit_datatype(&pdb.Datatypes[i]); err != nil {
			return
		}
	}
	for i := range pdb.Enums {
		if err = v.visit_enum(&pdb.Enums[i]); err != nil {
			return
		}
	}
	for i := range pdb.Functions {
		if err = v.visit_function(&pdb.Functions[i]); err != nil {
			return
		}
	}
	for i := range pdb.Tables {
		if err = v.visit_table(&pdb.Tables[i]); err != nil {
			return
		}
	}
	for i := range pdb.Typedefs {
		if err = v.visit_typedef(&pdb.Typedefs[i]); err != nil {
			return
		}
	}
	return
}
// write_tokens flushes the collected tokens in a deterministic order:
// datatypes (by key), symbols (by address), strings (by contents), then
// constants (in visit order). Deduplicated tokens receive their IDs here,
// since they are not numbered at visit time — the original wrote them all
// with a zero ID.
func (v *pdb_token_visitor) write_tokens() (err error) {
	datatypes := slices.Collect(maps.Keys(v.datatypes))
	sort.Strings(datatypes)
	symbols := slices.Collect(maps.Keys(v.symbols))
	// uint64 is ordered; no hand-written comparator needed
	slices.Sort(symbols)
	string_keys := slices.Collect(maps.Keys(v.strings))
	sort.Strings(string_keys)
	write := func(token *db.Token) error {
		if token.ID == 0 {
			token.ID = v.tokens_database.next_token_id()
		}
		return v.tokens_database.Write(token)
	}
	for _, datatype := range datatypes {
		if err = write(v.datatypes[datatype]); err != nil {
			return
		}
	}
	for _, symbol := range symbols {
		if err = write(v.symbols[symbol]); err != nil {
			return
		}
	}
	for _, str := range string_keys {
		if err = write(v.strings[str]); err != nil {
			return
		}
	}
	// constants were numbered by visit_constant already
	for i := range v.constants {
		if err = v.tokens_database.Write(&v.constants[i]); err != nil {
			return
		}
	}
	return
}

189
go/app/util/query.go Normal file
View file

@ -0,0 +1,189 @@
package util
import (
"encoding/json"
"fmt"
"os"
"regexp"
"slices"
"github.com/parquet-go/parquet-go"
"github.com/thunderbrewhq/binana/go/app"
"github.com/thunderbrewhq/binana/go/db"
)
// QueryPresentationMode selects how matched tokens are printed.
type QueryPresentationMode uint8

const (
	// PresentQueryNormal prints full token details: kind, sample, section,
	// offset, datatype and all names.
	PresentQueryNormal QueryPresentationMode = iota
	// PresentQueryNameOnly prints only the matching names, one per line.
	PresentQueryNameOnly
)

// QueryParams filters and formats a token-database query.
type QueryParams struct {
	// How matched tokens are presented on stdout
	Present QueryPresentationMode
	// Match pattern for profile
	Profile string
	// Possible values for Program
	Program []string
	// Possible values for OS
	OS []string
	// Possible values for arch
	Arch []string
	// Range of builds to return information for
	MinBuild uint32
	MaxBuild uint32
	// Regular expression for tokens (symbols/type information)
	Token string
}

// token_query carries the compiled query state across the token scan.
type token_query struct {
	params *QueryParams
	// samples indexed by ID, used to filter tokens by their source sample
	sample_database map[string]db.Sample
	// POSIX regular expression compiled from params.Token
	token_regexp *regexp.Regexp
}
// present_token prints a matched token to stdout in the configured
// presentation mode. Tokens of unknown kind are not presented.
func (token_query *token_query) present_token(token *db.Token) {
	if token_query.params.Present == PresentQueryNameOnly {
		// name-only mode: emit each matching name on its own line
		for _, name := range token.Names {
			if token_query.token_regexp.MatchString(name.Name) {
				fmt.Println(name.Name)
			}
		}
		return
	}
	var kind_name string
	switch token.Kind {
	case db.OriginalConstantToken:
		kind_name = "original constant"
	case db.OriginalDatatypeToken:
		kind_name = "original datatype"
	case db.OriginalStringToken:
		kind_name = "original string"
	case db.OriginalSymbolToken:
		kind_name = "original symbol"
	default:
		return
	}
	fmt.Printf("%s in sample: '%s' section: '%s'", kind_name, token.Source[:8], token.Section)
	if token.Offset != "" {
		fmt.Printf(" at %s", token.Offset)
	}
	if token.Datatype != "" {
		fmt.Printf(" with datatype: '%s'", token.Datatype)
	}
	fmt.Printf("\n")
	fmt.Printf("names:\n")
	for _, name := range token.Names {
		var name_kind_name string
		switch name.Kind {
		case db.OriginalName:
			name_kind_name = "original name"
		case db.DemangledName:
			name_kind_name = "demangled name"
		case db.BinanaizedName:
			name_kind_name = "binanaized name"
		default:
			panic(name.Kind)
		}
		fmt.Printf("%s '%s'\n", name_kind_name, name.Name)
	}
	fmt.Printf("--\n\n")
}
// match_token tests one token against the query filters and presents it on
// a match. If quit = true is returned, the search is halted.
func (token_query *token_query) match_token(token *db.Token) (quit bool) {
	sample, ok := token_query.sample_database[token.Source]
	if !ok {
		m, err := json.Marshal(token)
		if err != nil {
			panic(err)
		}
		fmt.Fprintln(os.Stderr, string(m))
		app.Fatal(fmt.Errorf("a token references a sample (%s) that does not exist in the sample database. please fix your database", token.Source))
		return
	}
	// filter out tokens from samples we don't care about
	if sample.Build < token_query.params.MinBuild || sample.Build > token_query.params.MaxBuild {
		return
	}
	if len(token_query.params.Program) > 0 && !slices.Contains(token_query.params.Program, sample.Program) {
		return
	}
	if len(token_query.params.OS) > 0 && !slices.Contains(token_query.params.OS, sample.OS) {
		return
	}
	if len(token_query.params.Arch) > 0 && !slices.Contains(token_query.params.Arch, sample.Arch) {
		return
	}
	// present the token once if any of its names matches
	for _, name := range token.Names {
		if token_query.token_regexp.MatchString(name.Name) {
			token_query.present_token(token)
			break
		}
	}
	return
}
// load_sample_database reads db/samples.parquet into an ID-keyed map used
// to resolve token sources during the scan.
func (token_query *token_query) load_sample_database() (err error) {
	token_query.sample_database = make(map[string]db.Sample)
	var samples []db.Sample
	if samples, err = parquet.ReadFile[db.Sample]("db/samples.parquet"); err != nil {
		return
	}
	for i := range samples {
		token_query.sample_database[samples[i].ID] = samples[i]
	}
	return
}
// Query scans db/tokens.parquet and prints tokens whose names match
// params.Token, filtered by the sample metadata in db/samples.parquet.
// Errors are reported through app.Fatal.
func Query(params *QueryParams) {
	var token_query token_query
	token_query.params = params
	token_query.token_regexp = regexp.MustCompilePOSIX(params.Token)
	if err := token_query.load_sample_database(); err != nil {
		app.Fatal(err)
		return
	}
	tokens_db_file, err := os.Open("db/tokens.parquet")
	if err != nil {
		app.Fatal(err)
		return
	}
	defer tokens_db_file.Close()
	reader := parquet.NewGenericReader[db.Token](tokens_db_file)
	defer reader.Close()
	rows := make([]db.Token, 1024)
read_loop:
	for {
		n, read_err := reader.Read(rows)
		// per io.Reader semantics, consume the n rows returned even when
		// read_err is non-nil (e.g. io.EOF on the final batch); the
		// original broke out first and silently dropped that batch
		for i := range rows[:n] {
			if token_query.match_token(&rows[i]) {
				break read_loop
			}
		}
		if read_err != nil {
			break
		}
	}
}