feat(binana): add tokens database
Some checks failed
Push / build (push) Has been cancelled

This commit is contained in:
phaneron 2026-03-20 01:58:16 -04:00
parent ac268a16c8
commit 2c2815ab0b
22 changed files with 2122 additions and 2 deletions

3
db/samples.parquet Normal file
View file

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f19fac1fbb4db2383995a0285a30e1826e567e4198e35137ac773e0bad516401
size 6011

3
db/tokens.parquet Normal file
View file

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8c31761b8675ce1fee186061dc84c184b5eb23ec92368230abbf611ffc9143f7
size 156789790

12
go.mod
View file

@ -4,18 +4,26 @@ go 1.25.5
require (
github.com/fatih/color v1.18.0
github.com/pierrec/lz4/v4 v4.1.21
github.com/pierrec/lz4/v4 v4.1.26
github.com/spf13/cobra v1.8.1
modernc.org/cc/v3 v3.41.0
)
require (
github.com/andybalholm/brotli v1.2.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/klauspost/compress v1.18.4 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/parquet-go/bitpack v1.0.0 // indirect
github.com/parquet-go/jsonlite v1.5.0 // indirect
github.com/parquet-go/parquet-go v0.29.0 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/spf13/pflag v1.0.5 // indirect
golang.org/x/sys v0.25.0 // indirect
github.com/twpayne/go-geom v1.6.1 // indirect
golang.org/x/sys v0.42.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
lukechampine.com/uint128 v1.3.0 // indirect
modernc.org/mathutil v1.6.0 // indirect
modernc.org/strutil v1.2.0 // indirect

21
go.sum
View file

@ -1,3 +1,5 @@
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
@ -5,15 +7,27 @@ github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM=
github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA=
github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs=
github.com/parquet-go/jsonlite v1.5.0 h1:ulS7lNWdPwiqDMLzTiXHYmIUhu99mavZh2iAVdXet3g=
github.com/parquet-go/jsonlite v1.5.0/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0=
github.com/parquet-go/parquet-go v0.29.0 h1:xXlPtFVR51jpSVzf+cgHnNIcb7Xet+iuvkbe0HIm90Y=
github.com/parquet-go/parquet-go v0.29.0/go.mod h1:navtkAYr2LGoJVp141oXPlO/sxLvaOe3la2JEoD8+rg=
github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=
github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pierrec/lz4/v4 v4.1.26 h1:GrpZw1gZttORinvzBdXPUXATeqlJjqUG/D87TKMnhjY=
github.com/pierrec/lz4/v4 v4.1.26/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
@ -21,10 +35,17 @@ github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/twpayne/go-geom v1.6.1 h1:iLE+Opv0Ihm/ABIcvQFGIiFBXd76oBIar9drAwHFhR4=
github.com/twpayne/go-geom v1.6.1/go.mod h1:Kr+Nly6BswFsKM5sd31YaoWS5PeDDH2NftJTK7Gd028=
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
lukechampine.com/uint128 v1.3.0 h1:cDdUVfRwDUDovz610ABgFD17nXD4/uDgVHl2sC3+sbo=

View file

@ -4,6 +4,9 @@ import (
_ "github.com/thunderbrewhq/binana/go/app/cmd/add_symbol"
_ "github.com/thunderbrewhq/binana/go/app/cmd/lint"
_ "github.com/thunderbrewhq/binana/go/app/cmd/make"
_ "github.com/thunderbrewhq/binana/go/app/cmd/make_samples"
_ "github.com/thunderbrewhq/binana/go/app/cmd/make_tokens"
_ "github.com/thunderbrewhq/binana/go/app/cmd/query"
"github.com/thunderbrewhq/binana/go/app/cmd/root"
_ "github.com/thunderbrewhq/binana/go/app/cmd/tidy"

View file

@ -0,0 +1,68 @@
package make_samples
import (
"github.com/spf13/cobra"
"github.com/thunderbrewhq/binana/go/app"
"github.com/thunderbrewhq/binana/go/app/cmd/root"
"github.com/thunderbrewhq/binana/go/app/util"
"github.com/thunderbrewhq/binana/go/app/util/dbutil"
)
// make_samples_cmd is the cobra command that builds the sample database
// from a tree of sample binaries.
var make_samples_cmd = cobra.Command{
	Use: "make-samples",
	Run: run_make_samples_command,
}

// init registers the make-samples command and its flags on the root command.
func init() {
	flags := make_samples_cmd.Flags()
	flags.StringP("source", "s", "", "required: source tree of sample binaries")
	flags.StringP("output-file", "o", "", "write the database to a file")
	flags.StringSlice("direct-mirror", nil, "list of direct mirror URLs that already contain the sample binaries")
	flags.StringSlice("ipfs-gateway", nil, "list of IPFS gateways")
	flags.StringP("format", "f", "json", "the format of the output database [json|parquet]")
	root.RootCmd.AddCommand(&make_samples_cmd)
}
// run_make_samples_command gathers the command line flags into
// MakeSampleDatabaseParams and builds the sample database.
func run_make_samples_command(cmd *cobra.Command, args []string) {
	flags := cmd.Flags()

	// helpers: read a flag value, aborting on flag lookup errors
	get_string := func(name string) string {
		value, get_err := flags.GetString(name)
		if get_err != nil {
			app.Fatal(get_err)
		}
		return value
	}
	get_slice := func(name string) []string {
		values, get_err := flags.GetStringSlice(name)
		if get_err != nil {
			app.Fatal(get_err)
		}
		return values
	}

	var params util.MakeSampleDatabaseParams

	// a source tree is mandatory; show usage when it is missing
	params.Source = get_string("source")
	if params.Source == "" {
		cmd.Help()
		return
	}
	params.Output = get_string("output-file")

	format := get_string("format")
	switch format {
	case "json":
		params.Format = dbutil.DatabaseJSON
	case "parquet":
		params.Format = dbutil.DatabaseParquet
	default:
		app.Fatal("unknown format", format)
	}

	params.DirectMirrors = get_slice("direct-mirror")
	params.IPFSGateways = get_slice("ipfs-gateway")

	util.MakeSampleDatabase(&params)
}

View file

@ -0,0 +1,57 @@
package make_tokens
import (
"github.com/spf13/cobra"
"github.com/thunderbrewhq/binana/go/app"
"github.com/thunderbrewhq/binana/go/app/cmd/root"
"github.com/thunderbrewhq/binana/go/app/util"
"github.com/thunderbrewhq/binana/go/app/util/dbutil"
)
// make_tokens_cmd is the cobra command that builds the token database
// from a tree of sample binaries.
var make_tokens_cmd = cobra.Command{
	Use: "make-tokens",
	Run: run_make_tokens_command,
}

// init registers the make-tokens command and its flags on the root command.
func init() {
	flags := make_tokens_cmd.Flags()
	flags.StringP("source", "s", "", "required: source tree of sample binaries")
	flags.StringP("output-file", "o", "", "write the database to a file")
	flags.StringP("format", "f", "json", "the format of the output database [json|parquet]")
	root.RootCmd.AddCommand(&make_tokens_cmd)
}
// run_make_tokens_command gathers the command line flags into
// MakeTokenDatabaseParams and builds the token database.
func run_make_tokens_command(cmd *cobra.Command, args []string) {
	flags := cmd.Flags()

	// helper: read a string flag, aborting on flag lookup errors
	get_string := func(name string) string {
		value, get_err := flags.GetString(name)
		if get_err != nil {
			app.Fatal(get_err)
		}
		return value
	}

	var params util.MakeTokenDatabaseParams

	// a source tree is mandatory; show usage when it is missing
	params.Source = get_string("source")
	if params.Source == "" {
		cmd.Help()
		return
	}
	params.Output = get_string("output-file")

	format := get_string("format")
	switch format {
	case "json":
		params.Format = dbutil.DatabaseJSON
	case "parquet":
		params.Format = dbutil.DatabaseParquet
	default:
		app.Fatal("unknown format", format)
	}

	util.MakeTokenDatabase(&params)
}

72
go/app/cmd/query/query.go Normal file
View file

@ -0,0 +1,72 @@
package query
import (
"math"
"github.com/spf13/cobra"
"github.com/thunderbrewhq/binana/go/app"
"github.com/thunderbrewhq/binana/go/app/cmd/root"
"github.com/thunderbrewhq/binana/go/app/util"
)
// query_cmd implements `q`, which searches the token database by regexp.
var query_cmd = cobra.Command{
	Use:   "q regexp",
	Args:  cobra.MinimumNArgs(1),
	Short: "query the token database for information",
	Run:   run_query_cmd,
}

// init registers the query command and its filter/presentation flags on the
// root command.
func init() {
	flags := query_cmd.Flags()
	flags.Uint32("min-build", 0, "the minimum build to return tokens for")
	flags.Uint32("max-build", math.MaxUint32, "the maximum build to return tokens for")
	flags.StringSlice("program", nil, "a list of programs to return tokens for")
	flags.StringSlice("os", nil, "a list of kernel names to return tokens for (windows, darwin, linux)")
	flags.StringSlice("arch", nil, "a list of CPU architectures to return tokens for (ppc, 386, amd64)")
	flags.String("present", "normal", "control the way tokens are presented to console (normal, name-only)")
	root.RootCmd.AddCommand(&query_cmd)
}
// run_query_cmd parses the query flags into util.QueryParams and runs the
// token database query. args[0] is the token regexp (presence enforced by
// cobra.MinimumNArgs(1)).
func run_query_cmd(cmd *cobra.Command, args []string) {
	f := cmd.Flags()
	var (
		params            util.QueryParams
		err               error
		presentation_mode string
	)
	params.MinBuild, err = f.GetUint32("min-build")
	if err != nil {
		app.Fatal(err)
	}
	params.MaxBuild, err = f.GetUint32("max-build")
	if err != nil {
		app.Fatal(err)
	}
	params.Program, err = f.GetStringSlice("program")
	if err != nil {
		app.Fatal(err)
	}
	// the next three flag errors were previously swallowed with a bare
	// return, silently exiting the command; report them like the others
	params.OS, err = f.GetStringSlice("os")
	if err != nil {
		app.Fatal(err)
	}
	params.Arch, err = f.GetStringSlice("arch")
	if err != nil {
		app.Fatal(err)
	}
	presentation_mode, err = f.GetString("present")
	if err != nil {
		app.Fatal(err)
	}
	switch presentation_mode {
	case "normal":
		params.Present = util.PresentQueryNormal
	case "name-only":
		params.Present = util.PresentQueryNameOnly
	default:
		// unknown presentation mode: show usage instead of failing hard
		cmd.Help()
		return
	}
	params.Token = args[0]
	util.Query(&params)
}

View file

@ -0,0 +1,14 @@
package dbutil
import "errors"
// DatabaseFormat selects the on-disk encoding used when writing a database.
type DatabaseFormat uint8

const (
	// DatabaseParquet writes the database as an Apache Parquet file.
	DatabaseParquet DatabaseFormat = iota
	// DatabaseJSON writes the database as a stream of JSON documents.
	DatabaseJSON
)

var (
	// ErrUnknownDatabaseFormat is returned by Open when the requested
	// format is not one of the DatabaseFormat constants above.
	ErrUnknownDatabaseFormat = errors.New("dbutil: unknown database format")
)

View file

@ -0,0 +1,80 @@
package dbutil
import (
"encoding/json"
"fmt"
"io"
"os"
"github.com/parquet-go/parquet-go"
)
// Writer writes batches of database entries of type T to an underlying
// encoder. The concrete write and close behaviors are injected by Open.
type Writer[T any] struct {
	write func([]T) (err error)
	close func() (err error)
}

// WriteEntries forwards a batch of entries to the underlying encoder.
func (w *Writer[T]) WriteEntries(entries []T) error {
	return w.write(entries)
}

// Close flushes the underlying encoder and releases its resources.
func (w *Writer[T]) Close() error {
	return w.close()
}
// Open creates a database Writer for the given format. When name is empty
// the database is written to stdout; otherwise the named file is created.
// The returned writer must be closed to flush buffered data. An unknown
// format yields an error wrapping ErrUnknownDatabaseFormat.
func Open[T any](name string, format DatabaseFormat) (writer *Writer[T], err error) {
	writer = new(Writer[T])
	var (
		file   *os.File
		output io.Writer
	)
	if name == "" {
		output = os.Stdout
	} else {
		file, err = os.Create(name)
		if err != nil {
			return
		}
		output = file
	}
	// close_file closes the backing file (nil-safe); shared by both formats
	close_file := func() error {
		if file != nil {
			return file.Close()
		}
		return nil
	}
	switch format {
	case DatabaseJSON:
		encoder := json.NewEncoder(output)
		writer.write = func(entries []T) (err error) {
			// one JSON document per entry (JSON-lines style)
			for i := range entries {
				if err = encoder.Encode(&entries[i]); err != nil {
					return
				}
			}
			return
		}
		writer.close = close_file
	case DatabaseParquet:
		generic_writer := parquet.NewGenericWriter[T](output)
		writer.write = func(entries []T) (err error) {
			_, err = generic_writer.Write(entries)
			return
		}
		writer.close = func() (err error) {
			// flush the parquet footer first, but always close the file,
			// even when the footer flush fails (previously leaked)
			err = generic_writer.Close()
			if close_err := close_file(); err == nil {
				err = close_err
			}
			return
		}
	default:
		// previously the freshly created file leaked on this path
		close_file()
		err = fmt.Errorf("%w: %d", ErrUnknownDatabaseFormat, format)
	}
	return
}

49
go/app/util/demangle.go Normal file
View file

@ -0,0 +1,49 @@
package util
import (
"bytes"
"fmt"
"os/exec"
"strings"
)
// demangle_cache memoizes successful demangling results, keyed by the
// mangled input string.
var demangle_cache = make(map[string]string)

// demangle runs the external "demumble" tool on str and returns the
// demangled form. It fails when the tool produces no output or when the
// output equals the input (i.e. nothing was actually demangled).
func demangle(str string) (demangled string, err error) {
	if cached, ok := demangle_cache[str]; ok {
		return cached, nil
	}
	var stdout bytes.Buffer
	command := exec.Command("demumble", str)
	command.Stdout = &stdout
	// the tool's exit status is deliberately ignored; empty or unchanged
	// output is what signals failure here
	command.Run()
	if stdout.Len() == 0 {
		err = fmt.Errorf("cannot demangle")
		return
	}
	demangled = strings.TrimSuffix(stdout.String(), "\n")
	if demangled == str {
		err = fmt.Errorf("cannot demangle")
		return
	}
	demangle_cache[str] = demangled
	return
}
// looks_mangled reports whether str resembles a compiler-mangled symbol or
// type name that is worth handing to the demangler.
func looks_mangled(str string) bool {
	switch {
	case strings.HasPrefix(str, "."),
		strings.HasPrefix(str, "?"),
		strings.HasPrefix(str, "_Z"):
		// common mangling prefixes (MSVC "?", Itanium "_Z", dotted internal names)
		return true
	case len(str) > 0 && str[0] >= '0' && str[0] <= '9':
		// identifiers cannot start with a digit, so this is suspicious
		return true
	case strings.ContainsAny(str, "0123456789") && !strings.Contains(str, " "):
		// digits without any spaces often indicate embedded length encodings
		return true
	default:
		return false
	}
}

31
go/app/util/exe.go Normal file
View file

@ -0,0 +1,31 @@
package util
import (
"debug/pe"
"os"
)
// get_exe_base_address returns the preferred image base address of the PE
// executable at name. A default of 0x400000 is returned when the optional
// header is missing or of an unrecognized type.
func get_exe_base_address(name string) (base_address uint64, err error) {
	var file *os.File
	file, err = os.Open(name)
	if err != nil {
		return
	}
	// previously the descriptor leaked when pe.NewFile failed
	defer file.Close()
	var pe_file *pe.File
	pe_file, err = pe.NewFile(file)
	if err != nil {
		return
	}
	// classic default image base for Windows executables
	base_address = uint64(0x400000)
	switch h := pe_file.OptionalHeader.(type) {
	case *pe.OptionalHeader32:
		base_address = uint64(h.ImageBase)
	case *pe.OptionalHeader64:
		base_address = h.ImageBase
	}
	return
}

19
go/app/util/hash.go Normal file
View file

@ -0,0 +1,19 @@
package util
import (
"crypto/sha256"
"encoding/hex"
"os"
)
// hash_file returns the hex-encoded SHA-256 digest of the file's contents,
// used as a stable content-addressed identifier for samples and tokens.
func hash_file(name string) (id string, err error) {
	var b []byte
	b, err = os.ReadFile(name)
	if err != nil {
		return
	}
	// sha256.Sum256 replaces the hand-rolled New/Write/Sum sequence
	sum := sha256.Sum256(b)
	id = hex.EncodeToString(sum[:])
	return
}

224
go/app/util/make-samples.go Normal file
View file

@ -0,0 +1,224 @@
package util
import (
"bufio"
"bytes"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"github.com/thunderbrewhq/binana/go/app"
"github.com/thunderbrewhq/binana/go/app/util/dbutil"
"github.com/thunderbrewhq/binana/go/db"
)
// MakeSampleDatabaseParams configures MakeSampleDatabase.
type MakeSampleDatabaseParams struct {
	// A file name that corresponds to a tree of sample files.
	// Anything in this tree will be collected into the sample database.
	Source string
	// The name of the file to write the database to
	// (empty writes to stdout, per dbutil.Open).
	Output string
	// Sets the format of the database file.
	Format dbutil.DatabaseFormat
	// URLs that map to the root of the sample tree hierarchy.
	// Used to generate a list of mirror URLs for sample binaries.
	DirectMirrors []string
	// List of IPFS gateway URLs.
	// If not empty, a CID for the sample tree will be created and used to
	// build IPFS mirror URLs. Actually uploading anything in the sample
	// tree, however, is outside the scope of this tool.
	IPFSGateways []string
}
// sample_database accumulates db.Sample entries in memory and flushes them
// to a dbutil.Writer when closed.
type sample_database struct {
	writer        *dbutil.Writer[db.Sample]
	ipfs_tree_cid string
	buffer        []db.Sample
}

// add queues a sample for writing; the buffer is flushed in Close.
func (sd *sample_database) add(sample db.Sample) error {
	sd.buffer = append(sd.buffer, sample)
	return nil
}

// Close writes all buffered samples and then closes the underlying writer.
func (sd *sample_database) Close() error {
	if write_err := sd.writer.WriteEntries(sd.buffer); write_err != nil {
		return write_err
	}
	return sd.writer.Close()
}
// make_sample_file inspects a single file from the sample tree and, when it
// has a recognized binary extension, records a db.Sample for it. name is
// the on-disk path; relative_name is the path relative to the tree root and
// is used to build mirror URLs. Unrecognized extensions are skipped.
//
// Sample files must be named program-version-build-os-arch.ext.
func (sample_database *sample_database) make_sample_file(params *MakeSampleDatabaseParams, name, relative_name string) (err error) {
	var sample db.Sample
	// infer mime-type from extension
	switch filepath.Ext(name) {
	case ".exe":
		sample.MimeType = "application/vnd.microsoft.portable-executable"
	case ".pdb":
		sample.MimeType = "application/x-ms-pdb"
		// associate the PDB with its EXE, when one sits next to it
		sample_exe_name := strings.TrimSuffix(name, ".pdb") + ".exe"
		if _, err = os.Stat(sample_exe_name); err == nil {
			sample.Executable, err = hash_file(sample_exe_name)
			if err != nil {
				// previously panicked; propagate like a normal error
				return
			}
		}
	case ".macho":
		sample.MimeType = "application/x-mach-binary"
	case ".elf":
		sample.MimeType = "application/x-executable"
	default:
		// don't care about this
		return
	}
	sample.ID, err = hash_file(name)
	if err != nil {
		// previously panicked; propagate like a normal error
		return
	}
	// get the base filename and split it, without its extension, on dashes
	base_name := filepath.Base(name)
	filename_components := strings.Split(strings.TrimSuffix(base_name, filepath.Ext(base_name)), "-")
	// previously this indexed blindly and panicked on malformed names
	if len(filename_components) < 5 {
		err = fmt.Errorf("util: malformed sample filename %q (want program-version-build-os-arch)", base_name)
		return
	}
	sample.Program = filename_components[0]
	sample.Version = filename_components[1]
	var build uint64
	// bitSize 32 so an out-of-range build errors instead of silently
	// truncating on the uint32 conversion below
	build, err = strconv.ParseUint(filename_components[2], 0, 32)
	if err != nil {
		err = fmt.Errorf("util: bad build number in %q: %w", base_name, err)
		return
	}
	sample.Build = uint32(build)
	sample.OS = filename_components[3]
	sample.Arch = filename_components[4]
	// now, create various mirrors
	for _, direct_mirror := range params.DirectMirrors {
		sample.Mirrors = append(sample.Mirrors, db.SampleMirror{
			Kind: db.MirrorDirect,
			URL:  direct_mirror + relative_name,
		})
	}
	for _, ipfs_gateway := range params.IPFSGateways {
		sample.Mirrors = append(sample.Mirrors, db.SampleMirror{
			Kind: db.MirrorIPFS,
			URL:  ipfs_gateway + "/" + sample_database.ipfs_tree_cid + relative_name,
		})
	}
	// now write the sample
	return sample_database.add(sample)
}
// make_tree recursively walks the sample tree rooted at name, recording a
// sample for every regular file. relative_name mirrors the walk relative to
// the tree root; it always uses forward slashes because it ends up in
// mirror URLs.
func (sample_database *sample_database) make_tree(params *MakeSampleDatabaseParams, name, relative_name string) (err error) {
	entries, read_err := os.ReadDir(name)
	if read_err != nil {
		return read_err
	}
	for _, entry := range entries {
		child := name + "/" + entry.Name()
		relative_child := relative_name + "/" + entry.Name()
		if entry.IsDir() {
			err = sample_database.make_tree(params, child, relative_child)
		} else {
			err = sample_database.make_sample_file(params, child, relative_child)
		}
		if err != nil {
			return
		}
	}
	return
}
// ipfs_generate_file_cid computes the IPFS CID of the tree rooted at name
// by running `ipfs add -qr --only-hash .` inside it (nothing is uploaded —
// that is outside the scope of this tool). The tool prints one CID per
// line; the last one is the root of the tree and is what gets returned.
func ipfs_generate_file_cid(name string) (cid string, err error) {
	command := exec.Command("ipfs", "add", "-qr", "--only-hash", ".")
	// run inside the tree; setting Dir avoids mutating the process-wide
	// working directory as the previous os.Chdir round-trip did
	command.Dir = name
	var command_output bytes.Buffer
	command.Stdout = &command_output
	// previously the Run error was discarded, losing failure causes such
	// as the ipfs binary being missing (Run also covers nonzero exits)
	if err = command.Run(); err != nil {
		err = fmt.Errorf("util: running ipfs tool: %w", err)
		return
	}
	// keep the last printed CID: the root of the tree
	command_output_scanner := bufio.NewScanner(&command_output)
	for command_output_scanner.Scan() {
		cid = command_output_scanner.Text()
	}
	err = command_output_scanner.Err()
	return
}
// MakeSampleDatabase walks params.Source and writes a sample database in
// the requested format. Errors are fatal and exit via app.Fatal.
func MakeSampleDatabase(params *MakeSampleDatabaseParams) {
	var database sample_database

	// when IPFS mirrors are requested, the tree CID must be known up front
	// so mirror URLs can be generated for every sample
	if len(params.IPFSGateways) != 0 {
		cid, cid_err := ipfs_generate_file_cid(params.Source)
		if cid_err != nil {
			app.Fatal(cid_err)
			return
		}
		database.ipfs_tree_cid = cid
	}

	writer, open_err := dbutil.Open[db.Sample](params.Output, params.Format)
	if open_err != nil {
		app.Fatal(open_err)
	}
	database.writer = writer

	// walk from the source root; "" is the relative path of the root itself
	if walk_err := database.make_tree(params, params.Source, ""); walk_err != nil {
		app.Fatal(walk_err)
	}
	if close_err := database.Close(); close_err != nil {
		app.Fatal(close_err)
	}
}

303
go/app/util/make-tokens.go Normal file
View file

@ -0,0 +1,303 @@
package util
import (
"compress/gzip"
"debug/macho"
"encoding/json"
"fmt"
"os"
"path/filepath"
"slices"
"strings"
"time"
"github.com/thunderbrewhq/binana/go/app"
"github.com/thunderbrewhq/binana/go/app/util/dbutil"
"github.com/thunderbrewhq/binana/go/db"
"github.com/thunderbrewhq/binana/go/pdbconv"
"github.com/thunderbrewhq/binana/go/stringrecovery"
)
// MakeTokenDatabaseParams configures MakeTokenDatabase.
type MakeTokenDatabaseParams struct {
	// Source is a file or directory tree of sample binaries to extract
	// tokens from (.pdb, .exe and .macho files are recognized).
	Source string
	// Output is the name of the file to write the database to
	// (empty writes to stdout, per dbutil.Open).
	Output string
	// Format sets the format of the output database file.
	Format dbutil.DatabaseFormat
}
// MakeTokenDatabase extracts tokens from the binaries under params.Source
// and writes them as a token database. Errors are fatal via app.Fatal.
func MakeTokenDatabase(params *MakeTokenDatabaseParams) {
	var database tokens_database
	if open_err := database.Open(params.Output, params.Format); open_err != nil {
		app.Fatal(open_err)
	}
	if make_err := database.make(params.Source); make_err != nil {
		app.Fatal(make_err)
	}
	if close_err := database.Close(); close_err != nil {
		app.Fatal(close_err)
	}
}
// tokens_database writes db.Token entries to a dbutil.Writer, handing out
// sequential token IDs as it goes.
type tokens_database struct {
	sequence uint64
	writer   *dbutil.Writer[db.Token]
}

// next_token_id returns the current sequence value and advances it.
// IDs start at 1 (see Open), so 0 never identifies a real token.
func (td *tokens_database) next_token_id() uint64 {
	id := td.sequence
	td.sequence++
	return id
}

// Open initializes the ID sequence and the underlying database writer.
func (td *tokens_database) Open(name string, format dbutil.DatabaseFormat) error {
	td.sequence = 1
	writer, err := dbutil.Open[db.Token](name, format)
	td.writer = writer
	return err
}

// Close flushes and closes the underlying database writer.
func (td *tokens_database) Close() error {
	return td.writer.Close()
}

// Write appends a single token to the database.
func (td *tokens_database) Write(token *db.Token) error {
	return td.writer.WriteEntries([]db.Token{*token})
}
// make_file_pdb extracts tokens from a PDB via its pre-converted
// .pdb.json.gz sidecar (produced by pdbconv). The matching .exe must exist
// next to the PDB so symbol addresses can be rebased onto the image base.
func (tokens_database *tokens_database) make_file_pdb(name string) (err error) {
	exe_name := strings.TrimSuffix(name, ".pdb") + ".exe"
	var base_address uint64
	base_address, err = get_exe_base_address(exe_name)
	if err != nil {
		return
	}
	fmt.Fprintln(os.Stderr, "[pdb]", name)
	var source_id string
	source_id, err = hash_file(name)
	if err != nil {
		return
	}
	fmt.Fprintln(os.Stderr, "[pdb]", source_id)
	// check for the existence of the .pdb.json.gz sidecar file
	_, err = os.Stat(name + ".json.gz")
	if err != nil {
		return
	}
	var gzip_file *os.File
	gzip_file, err = os.Open(name + ".json.gz")
	if err != nil {
		return
	}
	// previously the file leaked on every error path below
	defer gzip_file.Close()
	var gzip_reader *gzip.Reader
	gzip_reader, err = gzip.NewReader(gzip_file)
	if err != nil {
		// previously unchecked, which dereferenced a nil reader on a
		// corrupt gzip header
		return
	}
	defer gzip_reader.Close()
	var pdb pdbconv.ProgramDatabase
	if err = json.NewDecoder(gzip_reader).Decode(&pdb); err != nil {
		return
	}
	var v pdb_token_visitor
	v.init(tokens_database, source_id, base_address)
	if err = v.visit_all(&pdb); err != nil {
		return
	}
	err = v.write_tokens()
	return
}
// write_string_token records a recovered string as an original-string token
// at the given section/address, adding a demangled name alongside when the
// string looks like a mangled identifier.
func (tokens_database *tokens_database) write_string_token(source_id string, section_name string, address uint64, str string) (err error) {
	var db_token db.Token
	db_token.ID = tokens_database.next_token_id()
	db_token.Source = source_id
	db_token.Section = section_name
	db_token.Kind = db.OriginalStringToken
	db_token.Offset = fmt.Sprintf("%X", address)
	// keyed literal: the unkeyed db.TokenName{...} form trips `go vet` and
	// breaks silently if the struct's field order ever changes
	db_token.Names = append(db_token.Names, db.TokenName{Kind: db.OriginalName, Name: str})
	// detect if this is a mangled type identifier
	if looks_mangled(str) {
		demangled, demangle_err := demangle(str)
		if demangle_err == nil {
			db_token.Names = append(db_token.Names, db.TokenName{Kind: db.DemangledName, Name: demangled})
		}
	}
	err = tokens_database.Write(&db_token)
	return
}
// make_file_pe recovers strings from a PE image and records each one as a
// string token keyed by the file's content hash.
func (tokens_database *tokens_database) make_file_pe(name string) error {
	source_id, hash_err := hash_file(name)
	if hash_err != nil {
		return hash_err
	}
	return stringrecovery.RecoverFile(name, func(section_name string, address uint64, str string) {
		fmt.Fprintf(os.Stderr, "[pe] string found: %s %08X %s\n", section_name, address, str)
		// the recovery callback cannot propagate errors; best-effort write
		tokens_database.write_string_token(source_id, section_name, address, str)
	})
}
// make_file_macho extracts symbol and string tokens from a Mach-O binary.
// Internal (non-imported) named symbols become symbol tokens; recovered
// strings become string tokens. Diagnostic detail is logged to stderr.
func (tokens_database *tokens_database) make_file_macho(name string) (err error) {
	fmt.Fprintln(os.Stderr, "[mach-o]", name)
	var source_id string
	source_id, err = hash_file(name)
	if err != nil {
		return
	}
	fmt.Fprintln(os.Stderr, "[mach-o]", source_id)
	var (
		file       *os.File
		macho_file *macho.File
	)
	file, err = os.Open(name)
	if err != nil {
		return
	}
	// previously the descriptor leaked when macho.NewFile (or anything
	// after it) failed
	defer file.Close()
	macho_file, err = macho.NewFile(file)
	if err != nil {
		return
	}
	_, dwarf_err := macho_file.DWARF()
	if dwarf_err == nil {
		fmt.Fprintln(os.Stderr, "DWARF!")
		// NOTE(review): debug pause left in place; DWARF extraction is
		// not implemented yet — consider removing once it is
		time.Sleep(5 * time.Second)
	}
	fmt.Fprintln(os.Stderr, "[mach-o]", "cpu", macho_file.FileHeader.Cpu)
	fmt.Fprintln(os.Stderr, "[mach-o]", "loads:")
	// for _, load := range macho_file.Loads {
	// 	fmt.Fprintln(os.Stderr, "[mach-o]", load.String())
	// }
	fmt.Fprintln(os.Stderr, "[mach-o]", "sections:")
	for _, section := range macho_file.Sections {
		fmt.Fprintln(os.Stderr, "section", section.SectionHeader.Name)
	}
	// note: this condition was inverted before and logged the opposite state
	if macho_file.Dysymtab == nil {
		fmt.Fprintln(os.Stderr, "[mach-o]", "does not contain a dysymtab")
	} else {
		fmt.Fprintln(os.Stderr, "[mach-o]", "dysymtab:")
	}
	var imported_symbols []string
	imported_symbols, err = macho_file.ImportedSymbols()
	if err != nil {
		return
	}
	if macho_file.Symtab == nil {
		fmt.Fprintln(os.Stderr, "[mach-o]", "does not contain a symtab")
	} else {
		fmt.Fprintln(os.Stderr, "[mach-o]", "symtab:")
		for _, sym := range macho_file.Symtab.Syms {
			imported := slices.Contains(imported_symbols, sym.Name)
			var section_name string
			if sym.Sect != 0 {
				section_name = macho_file.Sections[sym.Sect-1].SectionHeader.Name
			}
			if imported {
				fmt.Fprintf(os.Stderr, "[mach-o] imported %s %02x %s\n", section_name, sym.Type, sym.Name)
				continue
			}
			if sym.Name == "" {
				// unnamed symbols carry no useful token information
				continue
			}
			fmt.Fprintf(os.Stderr, "[mach-o] internal %s %02x %s\n", section_name, sym.Type, sym.Name)
			var token db.Token
			token.ID = tokens_database.next_token_id()
			token.Source = source_id
			token.Kind = db.OriginalSymbolToken
			token.Section = section_name
			token.Offset = fmt.Sprintf("%X", sym.Value)
			token.Names = append(token.Names, db.TokenName{Kind: db.OriginalName, Name: sym.Name})
			if looks_mangled(sym.Name) {
				if demangled, demangle_err := demangle(sym.Name); demangle_err == nil {
					token.Names = append(token.Names, db.TokenName{Kind: db.DemangledName, Name: demangled})
				}
			}
			// previously ignored; a failed write should abort the walk
			if err = tokens_database.Write(&token); err != nil {
				return
			}
		}
	}
	err = stringrecovery.RecoverFile(name, func(section_name string, address uint64, str string) {
		fmt.Fprintf(os.Stderr, "[mach-o] string found: %s %08X %s\n", section_name, address, str)
		tokens_database.write_string_token(source_id, section_name, address, str)
	})
	return
}
// make_file dispatches a single file to the extractor matching its
// extension; anything unrecognized is skipped without error.
func (tokens_database *tokens_database) make_file(name string) error {
	switch filepath.Ext(name) {
	case ".macho":
		return tokens_database.make_file_macho(name)
	case ".pdb":
		return tokens_database.make_file_pdb(name)
	case ".exe":
		return tokens_database.make_file_pe(name)
	default:
		return nil
	}
}
// make_directory recursively extracts tokens from every file underneath
// the directory name.
func (tokens_database *tokens_database) make_directory(name string) error {
	entries, read_err := os.ReadDir(name)
	if read_err != nil {
		return read_err
	}
	for _, entry := range entries {
		child := filepath.Join(name, entry.Name())
		var child_err error
		if entry.IsDir() {
			child_err = tokens_database.make_directory(child)
		} else {
			child_err = tokens_database.make_file(child)
		}
		if child_err != nil {
			return child_err
		}
	}
	return nil
}
// make extracts tokens from name, which may be either a single binary or a
// directory tree of binaries.
func (tokens_database *tokens_database) make(name string) error {
	info, stat_err := os.Stat(name)
	if stat_err != nil {
		return stat_err
	}
	if info.IsDir() {
		return tokens_database.make_directory(name)
	}
	return tokens_database.make_file(name)
}

474
go/app/util/pdb.go Normal file
View file

@ -0,0 +1,474 @@
package util
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"maps"
"slices"
"sort"
"strconv"
"github.com/thunderbrewhq/binana/go/db"
"github.com/thunderbrewhq/binana/go/pdbconv"
)
// pdb_token_visitor walks a pdbconv.ProgramDatabase and accumulates tokens,
// deduplicating datatypes by name (content hash for enums), strings by
// value and symbols by rebased address before they are written out.
type pdb_token_visitor struct {
	tokens_database *tokens_database
	// location of the base module; function addresses are rebased onto it
	base_address uint64
	// content hash of the source PDB, stamped onto every token
	pdb_source_id string
	// string value -> token
	strings map[string]*db.Token
	// datatype name (or enum content hash) -> token
	datatypes map[string]*db.Token
	// maps a symbol to a token
	symbols map[uint64]*db.Token
	constants []db.Token
}

// init prepares the visitor for a single PDB identified by pdb_source_id,
// whose module loads at base_address.
func (v *pdb_token_visitor) init(tokens_database *tokens_database, pdb_source_id string, base_address uint64) {
	v.base_address = base_address
	v.pdb_source_id = pdb_source_id
	v.strings = make(map[string]*db.Token)
	v.datatypes = make(map[string]*db.Token)
	// map of address to token
	v.symbols = make(map[uint64]*db.Token)
	v.tokens_database = tokens_database
}
// visit_class folds a PDB class description into the datatype token keyed
// by the class name, merging names and members across repeated visits.
func (v *pdb_token_visitor) visit_class(class *pdbconv.Class) (err error) {
	token, ok := v.datatypes[class.Name]
	if !ok {
		token = new(db.Token)
		v.datatypes[class.Name] = token
	}
	// set token source to pdb
	token.Source = v.pdb_source_id
	// kind is debug information token
	token.Kind = db.OriginalDatatypeToken
	// record the original name once, even across repeated visits
	// (previously appended unconditionally, duplicating on each revisit;
	// keyed literal keeps go vet happy)
	name := db.TokenName{Kind: db.OriginalName, Name: class.Name}
	if !slices.Contains(token.Names, name) {
		token.Names = append(token.Names, name)
	}
	// set the basic type
	token.Keyword = "class"
	for _, member := range class.Members {
		var token_member db.TokenMember
		if member.Kind == "Member" {
			token_member.Kind = db.FieldMember
		} else if member.Kind == "Unknown" && member.Datatype == "void *" {
			// unknown "void *" members are treated as methods
			token_member.Kind = db.MethodMember
		} else {
			continue
		}
		token_member.Key = member.Name
		token_member.Value = member.Datatype
		if !slices.Contains(token.Members, token_member) {
			token.Members = append(token.Members, token_member)
		}
	}
	return
}
// visit_datatype folds a PDB struct/union description into the datatype
// token keyed by its name, merging names and members across repeated
// visits. Unhandled datatype kinds are reported as errors.
func (v *pdb_token_visitor) visit_datatype(datatype *pdbconv.Datatype) (err error) {
	token, ok := v.datatypes[datatype.Name]
	if !ok {
		token = new(db.Token)
		v.datatypes[datatype.Name] = token
	}
	// set token source to pdb
	token.Source = v.pdb_source_id
	// kind is debug information token
	token.Kind = db.OriginalDatatypeToken
	// record the original name once, even across repeated visits
	// (previously appended unconditionally, duplicating on each revisit;
	// keyed literal keeps go vet happy)
	name := db.TokenName{Kind: db.OriginalName, Name: datatype.Name}
	if !slices.Contains(token.Names, name) {
		token.Names = append(token.Names, name)
	}
	// set the basic type
	if datatype.Kind == "Structure" {
		token.Keyword = "struct"
	} else if datatype.Kind == "Union" {
		token.Keyword = "union"
	} else {
		err = fmt.Errorf("unhandled datatype kind '%s'", datatype.Kind)
		return
	}
	for _, member := range datatype.Members {
		var token_member db.TokenMember
		if member.Kind == "Member" {
			token_member.Kind = db.FieldMember
		} else if member.Kind == "Unknown" && member.Datatype == "void *" {
			// unknown "void *" members are treated as methods
			token_member.Kind = db.MethodMember
		} else {
			continue
		}
		token_member.Key = member.Name
		token_member.Value = member.Datatype
		if !slices.Contains(token.Members, token_member) {
			token.Members = append(token.Members, token_member)
		}
	}
	return
}
// visit_function folds a PDB function record into the symbol token at the
// function's rebased address, collecting its names, parameters, locals and
// referenced source file names.
func (v *pdb_token_visitor) visit_function(function *pdbconv.Function) (err error) {
	var address uint64
	address, err = strconv.ParseUint(function.Address, 0, 64)
	if err != nil {
		return
	}
	// rebase onto the module's load address
	address = v.base_address + address
	token, ok := v.symbols[address]
	if !ok {
		token = new(db.Token)
		v.symbols[address] = token
	}
	// set token source to pdb
	token.Source = v.pdb_source_id
	// kind is symbol information token
	token.Kind = db.OriginalSymbolToken
	// set address
	token.Offset = fmt.Sprintf("%X", address)
	// record names once, even across repeated visits (previously appended
	// unconditionally; keyed literals keep go vet happy)
	name := db.TokenName{Kind: db.OriginalName, Name: function.Name}
	if !slices.Contains(token.Names, name) {
		token.Names = append(token.Names, name)
	}
	if looks_mangled(function.Name) {
		demangled, demangler_err := demangle(function.Name)
		if demangler_err == nil {
			demangled_name := db.TokenName{Kind: db.DemangledName, Name: demangled}
			if !slices.Contains(token.Names, demangled_name) {
				token.Names = append(token.Names, demangled_name)
			}
		}
	}
	// visit source files
	for _, line_number := range function.LineNumbers {
		if err = v.visit_string(line_number.SourceFile); err != nil {
			return
		}
	}
	// classify stack variables as members
	for _, stack_variable := range function.StackVariables {
		var token_member db.TokenMember
		if stack_variable.Kind == "Parameter" || stack_variable.Kind == "ObjectPointer" {
			token_member.Kind = db.ParameterMember
		} else if stack_variable.Kind == "Local" {
			token_member.Kind = db.LocalMember
		} else if stack_variable.Kind == "StaticLocal" {
			token_member.Kind = db.StaticLocalMember
		} else if stack_variable.Kind == "Constant" {
			// these are repeated elsewhere
			continue
		} else {
			err = fmt.Errorf("unhandled stack variable kind '%s'", stack_variable.Kind)
			return
		}
		token_member.Key = stack_variable.Name
		token_member.Value = stack_variable.Datatype
		// dedup like the datatype visitors do, so revisits don't pile up
		// identical members
		if !slices.Contains(token.Members, token_member) {
			token.Members = append(token.Members, token_member)
		}
	}
	return
}
// visit_enum records an enum datatype. The datatype map is keyed by a
// SHA-256 content hash of the enum's name and members, so identical enums
// collapse into a single token.
func (v *pdb_token_visitor) visit_enum(enum *pdbconv.Enum) (err error) {
	digest := sha256.New()
	digest.Write([]byte(enum.Name))
	for _, member := range enum.Members {
		digest.Write([]byte(member.Name))
		digest.Write([]byte(fmt.Sprintf("%d", member.Value)))
	}
	key := hex.EncodeToString(digest.Sum(nil))
	tok, found := v.datatypes[key]
	if !found {
		tok = new(db.Token)
		v.datatypes[key] = tok
	}
	tok.Source = v.pdb_source_id
	tok.Keyword = "enum"
	// record the enum's name (may be __unnamed)
	tok.Names = append(tok.Names, db.TokenName{db.OriginalName, enum.Name})
	// enums are original datatype tokens
	tok.Kind = db.OriginalDatatypeToken
	// each enum value becomes a member; duplicates are skipped
	for _, member := range enum.Members {
		entry := db.TokenMember{
			Kind:  db.EnumMember,
			Key:   member.Name,
			Value: fmt.Sprintf("%d", member.Value),
		}
		if !slices.Contains(tok.Members, entry) {
			tok.Members = append(tok.Members, entry)
		}
	}
	return
}
// visit_string records s as a string token, keyed by its own content.
// Revisiting the same string overwrites the token's fields idempotently.
func (v *pdb_token_visitor) visit_string(s string) (err error) {
	tok, found := v.strings[s]
	if !found {
		tok = new(db.Token)
		v.strings[s] = tok
	}
	tok.Source = v.pdb_source_id
	tok.Kind = db.OriginalStringToken
	// the single name is the string itself
	tok.Names = []db.TokenName{{Kind: db.OriginalName, Name: s}}
	return
}
// visit_source_files_table mines every source file path in the table as a
// string token.
func (v *pdb_token_visitor) visit_source_files_table(table *pdbconv.Table) (err error) {
	for i := range table.SourceFiles {
		if err = v.visit_string(table.SourceFiles[i].Name); err != nil {
			return
		}
	}
	return
}
// visit_constant records an address-less constant symbol as a standalone
// token, appended to the constants list (constants are not keyed by address).
func (v *pdb_token_visitor) visit_constant(symbol *pdbconv.TableSymbol) (err error) {
	token := db.Token{
		ID:       v.tokens_database.next_token_id(),
		Source:   v.pdb_source_id,
		Keyword:  "const",
		Datatype: symbol.Datatype,
	}
	// both the decorated and undecorated names are recorded as original names
	for _, candidate := range []string{symbol.Name, symbol.Undecorated} {
		if candidate == "" {
			continue
		}
		token.Names = append(token.Names, db.TokenName{Kind: db.OriginalName, Name: candidate})
	}
	// the constant's literal value is carried as a member
	token.Members = append(token.Members, db.TokenMember{Kind: db.ConstantValueMember, Value: symbol.Value})
	v.constants = append(v.constants, token)
	return
}
// visit_table_symbol converts one row of a "Symbols" table into token data.
//
// Rows at address 0x0 do not describe a real location: constant rows are
// forwarded to visit_constant, and any names they carry are still mined as
// string tokens. Rows with a real address are merged into the per-address
// symbol token map, deduplicating names.
func (v *pdb_token_visitor) visit_table_symbol(symbol *pdbconv.TableSymbol) (err error) {
	if symbol.Address == "0x0" {
		if symbol.Value != "" && symbol.Kind == "Constant" {
			err = v.visit_constant(symbol)
			return
		}
		// so, this does not correspond to an actual symbol.
		// we can still mine it for string tokens.
		if symbol.Name != "" {
			if err = v.visit_string(symbol.Name); err != nil {
				return
			}
		}
		if symbol.Undecorated != "" {
			// BUG FIX: this previously called visit_string(symbol.Name) a
			// second time, silently dropping the undecorated name
			if err = v.visit_string(symbol.Undecorated); err != nil {
				return
			}
		}
		return
	}
	// this corresponds to an address
	// compute the real address
	var address uint64
	address, err = strconv.ParseUint(symbol.Address, 0, 64)
	if err != nil {
		return
	}
	address = v.base_address + address
	token, ok := v.symbols[address]
	if !ok {
		token = new(db.Token)
		v.symbols[address] = token
	}
	token.Source = v.pdb_source_id
	if symbol.Datatype != "" {
		token.Datatype = symbol.Datatype
	}
	// map the table's storage-class kind onto a keyword
	if symbol.Kind == "FileStatic" {
		token.Keyword = "static"
	} else if symbol.Kind == "Global" {
		token.Keyword = "global"
	}
	// record decorated and undecorated names, deduplicated
	if symbol.Name != "" {
		token_name := db.TokenName{db.OriginalName, symbol.Name}
		if !slices.Contains(token.Names, token_name) {
			token.Names = append(token.Names, token_name)
		}
	}
	if symbol.Undecorated != "" {
		undecorated := db.TokenName{db.OriginalName, symbol.Undecorated}
		if !slices.Contains(token.Names, undecorated) {
			token.Names = append(token.Names, undecorated)
		}
	}
	return
}
// visit_symbols_table walks every row of a "Symbols" table.
func (v *pdb_token_visitor) visit_symbols_table(table *pdbconv.Table) (err error) {
	// index loop avoids copying each row struct before taking its address
	for i := range table.Symbols {
		if err = v.visit_table_symbol(&table.Symbols[i]); err != nil {
			return
		}
	}
	return
}
// visit_table dispatches a table to its handler; only the SourceFiles and
// Symbols tables carry information we mine, other tables are ignored.
func (v *pdb_token_visitor) visit_table(table *pdbconv.Table) (err error) {
	switch table.Name {
	case "SourceFiles":
		err = v.visit_source_files_table(table)
	case "Symbols":
		err = v.visit_symbols_table(table)
	}
	return
}
// visit_typedef records a typedef as an original datatype token keyed by the
// typedef's own name, with the base type carried in Datatype.
func (v *pdb_token_visitor) visit_typedef(typedef *pdbconv.Typedef) (err error) {
	tok, found := v.datatypes[typedef.Name]
	if !found {
		tok = new(db.Token)
		v.datatypes[typedef.Name] = tok
	}
	tok.Source = v.pdb_source_id
	tok.Kind = db.OriginalDatatypeToken
	// record the typedef name, deduplicated across revisits
	name := db.TokenName{Kind: db.OriginalName, Name: typedef.Name}
	if !slices.Contains(tok.Names, name) {
		tok.Names = append(tok.Names, name)
	}
	tok.Datatype = typedef.Basetype
	return
}
// visit_all walks every section of the program database in a fixed order:
// classes, datatypes, enums, functions, tables, then typedefs. The first
// error aborts the walk.
func (v *pdb_token_visitor) visit_all(pdb *pdbconv.ProgramDatabase) (err error) {
	for i := range pdb.Classes {
		if err = v.visit_class(&pdb.Classes[i]); err != nil {
			return
		}
	}
	for i := range pdb.Datatypes {
		if err = v.visit_datatype(&pdb.Datatypes[i]); err != nil {
			return
		}
	}
	for i := range pdb.Enums {
		if err = v.visit_enum(&pdb.Enums[i]); err != nil {
			return
		}
	}
	for i := range pdb.Functions {
		if err = v.visit_function(&pdb.Functions[i]); err != nil {
			return
		}
	}
	for i := range pdb.Tables {
		if err = v.visit_table(&pdb.Tables[i]); err != nil {
			return
		}
	}
	for i := range pdb.Typedefs {
		if err = v.visit_typedef(&pdb.Typedefs[i]); err != nil {
			return
		}
	}
	return
}
// write_tokens flushes every collected token to the tokens database in a
// deterministic order: datatypes (by map key), symbols (by ascending
// address), strings (lexicographic), then constants (insertion order).
// Deterministic output keeps the parquet file reproducible across runs.
func (v *pdb_token_visitor) write_tokens() (err error) {
	datatypes := slices.Collect(maps.Keys(v.datatypes))
	sort.Strings(datatypes)
	symbols := slices.Collect(maps.Keys(v.symbols))
	// uint64 is an ordered type, so the stdlib comparison sort applies
	// directly; this replaces a hand-rolled three-way comparator
	slices.Sort(symbols)
	string_keys := slices.Collect(maps.Keys(v.strings))
	sort.Strings(string_keys)
	for _, datatype := range datatypes {
		if err = v.tokens_database.Write(v.datatypes[datatype]); err != nil {
			return
		}
	}
	for _, symbol := range symbols {
		if err = v.tokens_database.Write(v.symbols[symbol]); err != nil {
			return
		}
	}
	// note: loop variable renamed from "string", which shadowed the builtin type
	for _, s := range string_keys {
		if err = v.tokens_database.Write(v.strings[s]); err != nil {
			return
		}
	}
	for i := range v.constants {
		if err = v.tokens_database.Write(&v.constants[i]); err != nil {
			return
		}
	}
	return
}

189
go/app/util/query.go Normal file
View file

@ -0,0 +1,189 @@
package util
import (
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"regexp"
	"slices"

	"github.com/parquet-go/parquet-go"

	"github.com/thunderbrewhq/binana/go/app"
	"github.com/thunderbrewhq/binana/go/db"
)
// QueryPresentationMode selects how matched tokens are printed to stdout.
type QueryPresentationMode uint8

const (
	// PresentQueryNormal prints the full token record (kind, sample, names, ...).
	PresentQueryNormal QueryPresentationMode = iota
	// PresentQueryNameOnly prints only the names that matched, one per line.
	PresentQueryNameOnly
)
// QueryParams are the filters and options for one token query.
type QueryParams struct {
	// How matched tokens are presented on stdout
	Present QueryPresentationMode
	// Match pattern for profile
	Profile string
	// Possible values for Program
	Program []string
	// Possible values for OS
	OS []string
	// Possible values for arch
	Arch []string
	// Range of builds to return information for
	// NOTE(review): both bounds are inclusive and zero-valued by default, so a
	// caller that leaves MaxBuild unset filters out every nonzero build — confirm
	// callers always populate it
	MinBuild uint32
	MaxBuild uint32
	// Regular expression for tokens (symbols/type information)
	Token string
}
// token_query carries the state of one scan over the tokens database.
type token_query struct {
	// filters and presentation options for this query
	params *QueryParams
	// samples keyed by their SHA-256 id, loaded from db/samples.parquet
	sample_database map[string]db.Sample
	// compiled POSIX regular expression built from params.Token
	token_regexp *regexp.Regexp
}
// present_token prints a matched token to stdout according to the configured
// presentation mode.
func (token_query *token_query) present_token(token *db.Token) {
	// name-only mode: print just the names that matched the pattern
	if token_query.params.Present == PresentQueryNameOnly {
		for _, name := range token.Names {
			if token_query.token_regexp.MatchString(name.Name) {
				fmt.Println(name.Name)
			}
		}
		return
	}
	// unknown token kinds are silently skipped
	kind_name, known := map[db.TokenKind]string{
		db.OriginalConstantToken: "original constant",
		db.OriginalDatatypeToken: "original datatype",
		db.OriginalStringToken:   "original string",
		db.OriginalSymbolToken:   "original symbol",
	}[token.Kind]
	if !known {
		return
	}
	// header line: kind, abbreviated sample hash, section, then optional
	// offset and datatype
	fmt.Printf("%s in sample: '%s' section: '%s'", kind_name, token.Source[:8], token.Section)
	if token.Offset != "" {
		fmt.Printf(" at %s", token.Offset)
	}
	if token.Datatype != "" {
		fmt.Printf(" with datatype: '%s'", token.Datatype)
	}
	fmt.Printf("\n")
	fmt.Printf("names:\n")
	for _, name := range token.Names {
		var name_kind_name string
		switch name.Kind {
		case db.OriginalName:
			name_kind_name = "original name"
		case db.DemangledName:
			name_kind_name = "demangled name"
		case db.BinanaizedName:
			name_kind_name = "binanaized name"
		default:
			// an unknown name kind is a programmer error
			panic(name.Kind)
		}
		fmt.Printf("%s '%s'\n", name_kind_name, name.Name)
	}
	fmt.Printf("--\n\n")
}
// match_token applies the sample filters and the token name pattern, and
// presents the token when it matches. The returned quit flag would halt the
// scan; this implementation never sets it (app.Fatal terminates the process).
func (token_query *token_query) match_token(token *db.Token) (quit bool) {
	sample, known := token_query.sample_database[token.Source]
	if !known {
		// dump the offending token for diagnosis before aborting
		serialized, marshal_err := json.Marshal(token)
		if marshal_err != nil {
			panic(marshal_err)
		}
		fmt.Fprintln(os.Stderr, string(serialized))
		app.Fatal(fmt.Errorf("a token references a sample (%s) that does not exist in the sample database. please fix your database", token.Source))
		return
	}
	// filter out tokens from samples we don't care about
	if sample.Build < token_query.params.MinBuild || sample.Build > token_query.params.MaxBuild {
		return
	}
	if len(token_query.params.Program) > 0 && !slices.Contains(token_query.params.Program, sample.Program) {
		return
	}
	if len(token_query.params.OS) > 0 && !slices.Contains(token_query.params.OS, sample.OS) {
		return
	}
	if len(token_query.params.Arch) > 0 && !slices.Contains(token_query.params.Arch, sample.Arch) {
		return
	}
	// present the token as soon as any of its names matches the pattern
	for _, name := range token.Names {
		if token_query.token_regexp.MatchString(name.Name) {
			token_query.present_token(token)
			return
		}
	}
	return
}
// load_sample_database reads db/samples.parquet into a map keyed by sample id.
func (token_query *token_query) load_sample_database() (err error) {
	token_query.sample_database = make(map[string]db.Sample)
	samples, read_err := parquet.ReadFile[db.Sample]("db/samples.parquet")
	if read_err != nil {
		err = read_err
		return
	}
	for _, sample := range samples {
		token_query.sample_database[sample.ID] = sample
	}
	return
}
// Query scans the tokens database (db/tokens.parquet) and prints every token
// that matches the given parameters. Fatal errors terminate the process via
// app.Fatal.
func Query(params *QueryParams) {
	var token_query token_query
	token_query.params = params
	// an invalid pattern is a programmer/CLI error, so MustCompile's panic is acceptable
	token_query.token_regexp = regexp.MustCompilePOSIX(token_query.params.Token)
	if err := token_query.load_sample_database(); err != nil {
		app.Fatal(err)
	}
	tokens_db_file, err := os.Open("db/tokens.parquet")
	if err != nil {
		app.Fatal(err)
	}
	defer tokens_db_file.Close()
	rows := make([]db.Token, 1024)
	reader := parquet.NewGenericReader[db.Token](tokens_db_file)
	defer reader.Close()
read_loop:
	for {
		n, read_err := reader.Read(rows)
		// BUG FIX: process the rows we did receive before acting on the
		// error — the final partial batch may arrive together with io.EOF
		// and was previously discarded
		for i := range rows[:n] {
			if token_query.match_token(&rows[i]) {
				break read_loop
			}
		}
		if read_err != nil {
			// BUG FIX: non-EOF read errors were previously swallowed silently
			if !errors.Is(read_err, io.EOF) {
				app.Fatal(read_err)
			}
			break
		}
	}
}

47
go/db/sample.go Normal file
View file

@ -0,0 +1,47 @@
package db
// MirrorKind tells how a SampleMirror URL is to be fetched.
type MirrorKind uint8

const (
	// MirrorDirect is a direct download URL.
	MirrorDirect MirrorKind = iota
	// MirrorIPFS is an IPFS-hosted mirror.
	MirrorIPFS
)

// SampleMirror is one location a sample file can be downloaded from.
type SampleMirror struct {
	Kind MirrorKind `json:"kind" parquet:"kind"`
	URL  string     `json:"url" parquet:"url,delta"`
}
// Sample is one row of the sample database: a binary or debugging file that
// tokens are extracted from.
type Sample struct {
	// The SHA-256 sum of the sample file
	ID string `json:"id" parquet:"id,dict"`
	// <Optional> if this is a debugging file, then this is a SHA-256 sum
	// which references another sample file, which is the executable file
	Executable string `json:"exe,omitempty" parquet:"exe,dict"`
	// This is the MIME type identifier of the sample file.
	// Possible sample types include:
	// * (Windows .exe) application/vnd.microsoft.portable-executable
	// * (Mach-O binary) application/x-mach-binary
	// * (Linux binary) application/x-elf
	MimeType string `json:"mimetype" parquet:"mimetype,dict"`
	// This is the code that signifies which program the sample is a build of.
	Program string `json:"program" parquet:"program,dict"`
	// <Optional> This is the build sequence of the sample e.g. 12340
	Build uint32 `json:"build,omitempty" parquet:"build"`
	// <Optional> This is the semantic version/release id of the sample e.g. 3.3.5a
	Version string `json:"version,omitempty" parquet:"version"`
	// The OS of the sample, uses GOOS naming convention
	OS string `json:"os" parquet:"os,dict"`
	// The architecture of the sample, uses GOARCH naming convention
	Arch string `json:"arch" parquet:"arch,dict"`
	// Mirror locations where the sample can be downloaded
	Mirrors []SampleMirror `json:"mirrors,omitempty" parquet:"mirrors"`
}

79
go/db/token.go Normal file
View file

@ -0,0 +1,79 @@
package db
type (
	// TokenKind classifies what a Token represents (symbol, string, datatype, constant).
	TokenKind uint8
	// TokenNameKind classifies how a TokenName was obtained.
	TokenNameKind uint8
	// TokenMemberKind classifies what a TokenMember describes.
	TokenMemberKind uint8
)

const (
	// OriginalName means this string appeared verbatim in the original sample, and was not altered.
	OriginalName TokenNameKind = iota
	// DemangledName is provided in the case that the OriginalName was mangled by the compiler.
	DemangledName
	// Mangled names can be automatically Binanaized, i.e. converted into a naive syntax for wide
	// compatibility with SRE tools
	BinanaizedName
)

const (
	// The token was obtained from a PDB or a Mach-O symtab
	OriginalSymbolToken TokenKind = iota
	// The token was found by scanning the non-executable sections of the binary for 0-terminated ASCII strings
	OriginalStringToken
	// The token is a datatype that was obtained from a PDB or DWARF debugging file
	OriginalDatatypeToken
	// This token is a constant named value with no address
	OriginalConstantToken
)

const (
	// ConstantValueMember carries the literal value of a constant token
	ConstantValueMember TokenMemberKind = iota
	// EnumMember is one named value of an enum datatype
	EnumMember
	// This is a part of a struct
	// key = the field name
	// value = the C type of the field
	FieldMember
	// This is a method of a class
	MethodMember
	// This is an argument to a function
	ParameterMember
	// This is a local variable in a function
	LocalMember
	// This is a statically declared variable in a function
	StaticLocalMember
)
// TokenName is one name attached to a token, qualified by how it was obtained.
type TokenName struct {
	Kind TokenNameKind `json:"kind" parquet:"kind"`
	Name string        `json:"name" parquet:"name,dict"`
}

// TokenMember is one structured sub-entry of a token (struct field, method,
// parameter, enum value, ...); the meaning of Key/Value depends on Kind.
type TokenMember struct {
	Kind  TokenMemberKind `json:"kind" parquet:"kind"`
	Key   string          `json:"key,omitempty" parquet:"key,dict"`
	Value string          `json:"value" parquet:"value,dict"`
}

// Token is one row of the tokens database.
type Token struct {
	// Unique 64-bit identifier
	ID uint64 `json:"id" parquet:"id"`
	// The SHA-256 hash id of the sample which generated the token
	Source string `json:"src" parquet:"src,dict"`
	// The color and subhead of the token
	Kind TokenKind `json:"kind" parquet:"kind"`
	// If this is a datatype, keyword tells you what kind of datatype it is. Useful when generating C code.
	Keyword string `json:"keyword,omitempty" parquet:"keyword,dict"`
	// If this is a global variable/constant, this tells you the data type
	Datatype string `json:"datatype,omitempty" parquet:"datatype,dict"`
	// The section where the token originated
	// NOTE(review): Section, Offset, Names, and Highlights carry no parquet
	// tag, unlike the fields above — confirm the derived column names match
	// the on-disk schema
	Section string `json:"section,omitempty"`
	// The offset (in hexadecimal) where the symbol is located
	Offset string `json:"offset,omitempty"`
	// Alternate names for the token
	Names []TokenName `json:"names"`
	// Clickable references to other tokens
	Highlights []string `json:"crumbs,omitempty"`
	// Struct/Enum members
	Members []TokenMember `json:"members,omitempty"`
}

109
go/pdbconv/db.go Normal file
View file

@ -0,0 +1,109 @@
package pdbconv
// ClassMember is one member of a Class.
type ClassMember struct {
	Datatype string `json:"datatype,omitempty"`
	Kind     string `json:"kind,omitempty"`
	Length   uint64 `json:"length,omitempty"`
	Name     string `json:"name,omitempty"`
	Offset   uint64 `json:"offset,omitempty"`
}

// Class is a class type recovered from the program database.
type Class struct {
	Length  string        `json:"length,omitempty"`
	Name    string        `json:"name,omitempty"`
	Members []ClassMember `json:"member,omitempty"`
}

// DatatypeMember is one member of a Datatype.
type DatatypeMember struct {
	Datatype string `json:"datatype,omitempty"`
	Kind     string `json:"kind,omitempty"`
	Length   uint64 `json:"length,omitempty"`
	Name     string `json:"name,omitempty"`
	Offset   uint64 `json:"offset,omitempty"`
}

// Datatype is a composite type recovered from the program database; Kind
// distinguishes the flavor (e.g. "Union" — see the token visitor).
type Datatype struct {
	Kind    string           `json:"kind,omitempty"`
	Length  string           `json:"length,omitempty"`
	Name    string           `json:"name,omitempty"`
	Members []DatatypeMember `json:"member,omitempty"`
}
// EnumMember is one named value of an Enum.
type EnumMember struct {
	Name  string `json:"name,omitempty"`
	Value int    `json:"value,omitempty"`
}

// Enum is an enumeration type recovered from the program database.
type Enum struct {
	Length  uint64       `json:"length,omitempty"`
	Name    string       `json:"name,omitempty"`
	Type    string       `json:"type,omitempty"`
	Members []EnumMember `json:"member,omitempty"`
}

// FunctionLineNumber maps an address range of a Function onto a source file
// and line span.
type FunctionLineNumber struct {
	Address    string `json:"addr,omitempty"`
	End        int    `json:"end,omitempty"`
	Length     int    `json:"length,omitempty"`
	SourceFile string `json:"source_file,omitempty"`
	Start      int    `json:"start,omitempty"`
}

// FunctionStackVariable is one stack-resident variable of a Function; Kind is
// e.g. "Parameter", "Local", "StaticLocal", "Constant" (see the token visitor).
type FunctionStackVariable struct {
	Datatype string `json:"datatype,omitempty"`
	Kind     string `json:"kind,omitempty"`
	Length   uint64 `json:"length,omitempty"`
	Name     string `json:"name,omitempty"`
	Offset   uint64 `json:"offset,omitempty"`
}

// Function is a function symbol recovered from the program database.
type Function struct {
	Address        string                  `json:"address,omitempty"`
	Length         uint64                  `json:"length,omitempty"`
	Name           string                  `json:"name,omitempty"`
	LineNumbers    []FunctionLineNumber    `json:"line_numbers,omitempty"`
	StackVariables []FunctionStackVariable `json:"stack_variables,omitempty"`
}
// TableSegment is one segment entry of a Table.
type TableSegment struct {
	Address string `json:"address,omitempty"`
	Number  int    `json:"number,omitempty"`
}

// TableSourceFile is one source file entry of a Table.
type TableSourceFile struct {
	ID   string `json:"id,omitempty"`
	Name string `json:"name,omitempty"`
}

// TableSymbol is one symbol row of a Table; Address "0x0" marks rows that do
// not correspond to a real location (see the token visitor).
type TableSymbol struct {
	Address     string `json:"address,omitempty"`
	Datatype    string `json:"datatype,omitempty"`
	Index       uint64 `json:"index,omitempty"`
	Kind        string `json:"kind,omitempty"`
	Length      uint64 `json:"length,omitempty"`
	Name        string `json:"name,omitempty"`
	Tag         string `json:"tag,omitempty"`
	Undecorated string `json:"undecorated,omitempty"`
	Value       string `json:"value,omitempty"`
}

// Table is a named table of the program database; depending on Name, it holds
// segments, source files, or symbols ("SourceFiles" and "Symbols" are the
// names the token visitor consumes).
type Table struct {
	Name        string            `json:"name,omitempty"`
	Segments    []TableSegment    `json:"segments,omitempty"`
	SourceFiles []TableSourceFile `json:"source_files,omitempty"`
	Symbols     []TableSymbol     `json:"symbols,omitempty"`
}

// Typedef is a type alias recovered from the program database.
type Typedef struct {
	Basetype string `json:"basetype,omitempty"`
	Name     string `json:"name,omitempty"`
}

// ProgramDatabase is the root of the converted PDB document.
type ProgramDatabase struct {
	Classes   []Class    `json:"classes,omitempty"`
	Datatypes []Datatype `json:"datatypes,omitempty"`
	Enums     []Enum     `json:"enums,omitempty"`
	Functions []Function `json:"functions,omitempty"`
	Tables    []Table    `json:"tables,omitempty"`
	Typedefs  []Typedef  `json:"typedefs,omitempty"`
}

93
go/pdbconv/xml.go Normal file
View file

@ -0,0 +1,93 @@
package pdbconv
// GhidraXml is the XML schema this package decodes before converting it into
// the ProgramDatabase structures in db.go. The top-level sections mirror
// those structures one-to-one: classes, datatypes, enums, functions, tables,
// typedefs. Numeric attributes (lengths, offsets, addresses) arrive as
// strings and are parsed later.
// NOTE(review): the name suggests the XML is produced by a Ghidra PDB
// export — confirm the exact producer/version this schema tracks.
type GhidraXml struct {
	// class types and their members
	Classes struct {
		Class []struct {
			Length string `xml:"length,attr"`
			Name   string `xml:"name,attr"`
			Member []struct {
				Datatype string `xml:"datatype,attr"`
				Kind     string `xml:"kind,attr"`
				Length   string `xml:"length,attr"`
				Name     string `xml:"name,attr"`
				Offset   string `xml:"offset,attr"`
			} `xml:"member"`
		} `xml:"class"`
	} `xml:"classes"`
	// composite datatypes and their members
	Datatypes struct {
		Datatype []struct {
			Kind   string `xml:"kind,attr"`
			Length string `xml:"length,attr"`
			Name   string `xml:"name,attr"`
			Member []struct {
				Datatype string `xml:"datatype,attr"`
				Kind     string `xml:"kind,attr"`
				Length   string `xml:"length,attr"`
				Name     string `xml:"name,attr"`
				Offset   string `xml:"offset,attr"`
			} `xml:"member"`
		} `xml:"datatype"`
	} `xml:"datatypes"`
	// enumerations and their named values
	Enums struct {
		Enum []struct {
			Length string `xml:"length,attr"`
			Name   string `xml:"name,attr"`
			Type   string `xml:"type,attr"`
			Member []struct {
				Name  string `xml:"name,attr"`
				Value int    `xml:"value,attr"`
			} `xml:"member"`
		} `xml:"enum"`
	} `xml:"enums"`
	// functions with their line number and stack variable records
	Functions struct {
		Function []struct {
			Address    string `xml:"address,attr"`
			Length     string `xml:"length,attr"`
			Name       string `xml:"name,attr"`
			LineNumber []struct {
				Addr       string `xml:"addr,attr"`
				End        int    `xml:"end,attr"`
				Length     int    `xml:"length,attr"`
				SourceFile string `xml:"source_file,attr"`
				Start      int    `xml:"start,attr"`
			} `xml:"line_number"`
			StackVariable []struct {
				Datatype string `xml:"datatype,attr"`
				Kind     string `xml:"kind,attr"`
				Length   string `xml:"length,attr"`
				Name     string `xml:"name,attr"`
				Offset   string `xml:"offset,attr"`
			} `xml:"stack_variable"`
		} `xml:"function"`
	} `xml:"functions"`
	// named tables holding segments, source files, and symbols
	Tables struct {
		Table []struct {
			Name    string `xml:"name,attr"`
			Segment []struct {
				Address string `xml:"address,attr"`
				Number  int    `xml:"number,attr"`
			} `xml:"segment"`
			SourceFile []struct {
				ID   string `xml:"id,attr"`
				Name string `xml:"name,attr"`
			} `xml:"source_file"`
			Symbol []struct {
				Address     string `xml:"address,attr"`
				Datatype    string `xml:"datatype,attr"`
				Index       string `xml:"index,attr"`
				Kind        string `xml:"kind,attr"`
				Length      string `xml:"length,attr"`
				Name        string `xml:"name,attr"`
				Tag         string `xml:"tag,attr"`
				Undecorated string `xml:"undecorated,attr"`
				Value       string `xml:"value,attr"`
			} `xml:"symbol"`
		} `xml:"table"`
	} `xml:"tables"`
	// type aliases
	Typedefs struct {
		Typedef []struct {
			Basetype string `xml:"basetype,attr"`
			Name     string `xml:"name,attr"`
		} `xml:"typedef"`
	} `xml:"typedefs"`
}

View file

@ -0,0 +1,174 @@
package stringrecovery
import (
"debug/macho"
"debug/pe"
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"strings"
)
var (
	// charset_english is the set of bytes accepted as part of a recoverable
	// string. It is a raw string literal, so the backslashes in `\'` and
	// `[\\]` are literal characters rather than escapes; either reading puts
	// both '\' and '\'' in the set, so behavior is the same. The backtick
	// cannot appear inside a raw literal and is concatenated separately.
	charset_english = ` !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~` + "`"
	// lookup_table_english[b] reports whether byte b is in charset_english.
	lookup_table_english [256]bool
)

type (
	// Callback receives one recovered string: the name of the section it was
	// found in, its virtual address, and the string itself.
	Callback func(segment_name string, address uint64, token string)
)

// init builds the byte lookup table from the charset.
func init() {
	for _, c := range charset_english {
		lookup_table_english[c] = true
	}
}
// recover_section scans a section's bytes for runs of accepted ASCII
// characters terminated by a 0 byte and reports each run through callback.
//
// try_align: when set, the reported address is rounded up to the next
// word_size boundary and the bytes skipped by the rounding are dropped from
// the front of the token.
// minimum_length: runs shorter than this are discarded (measured before any
// alignment trimming).
// virtual_address: added to the in-section offset to form the reported address.
func recover_section(try_align bool, word_size uint64, minimum_length int, section_name string, virtual_address uint64, section_reader io.ReaderAt, callback Callback) (err error) {
	var (
		offset               int64
		current_token        strings.Builder
		current_token_offset int64
	)
	for {
		var b [1]byte
		if _, err = section_reader.ReadAt(b[:], offset); err != nil {
			if err == io.EOF {
				// end of section; an unterminated trailing run is discarded
				err = nil
				break
			}
			// BUG FIX: a non-EOF read error was previously ignored, leaving
			// the loop spinning on the same failing offset forever
			return
		}
		if b[0] == 0 {
			// a 0 byte terminates the current run (if any): emit it
			if current_token.Len() > 0 {
				if current_token.Len() < minimum_length {
					current_token.Reset()
					offset++
					continue
				}
				align_offset := 0
				if try_align {
					// round the start address up to the next word boundary,
					// trimming the same number of leading characters
					for i := uint64(current_token_offset); (i % word_size) != 0; i++ {
						current_token_offset++
						align_offset++
					}
				}
				current_token_string := current_token.String()
				// BUG FIX: align_offset could reach or exceed the token
				// length, previously panicking on the slice (or emitting an
				// empty token); such runs are now dropped
				if align_offset < len(current_token_string) {
					current_token_string = current_token_string[align_offset:]
					callback(section_name, virtual_address+uint64(current_token_offset), current_token_string)
				}
				current_token.Reset()
			}
			offset++
			continue
		}
		if lookup_table_english[b[0]] {
			if current_token.Len() == 0 {
				current_token_offset = offset
			}
			current_token.WriteByte(b[0])
		} else {
			// a non-printable byte discards everything leading up to this
			current_token.Reset()
		}
		offset++
	}
	return
}
// recover_file_macho recovers strings from the string-bearing sections of a
// Mach-O binary. word_size is 4 or 8 depending on the Mach-O flavor.
func recover_file_macho(word_size uint64, file *os.File, callback Callback) (err error) {
	macho_file, parse_err := macho.NewFile(file)
	if parse_err != nil {
		err = parse_err
		return
	}
	for _, section := range macho_file.Sections {
		fmt.Fprintln(os.Stderr, "recovering", section.Name)
		// __cstring accepts any length; __const requires at least 4 characters
		var minimum int
		switch section.Name {
		case "__cstring":
			minimum = 1
		case "__const":
			minimum = 4
		default:
			continue
		}
		if err = recover_section(false, word_size, minimum, section.Name, section.Addr, section, callback); err != nil {
			return
		}
	}
	return
}
// recover_file_pe recovers word-aligned strings from the .data and .rdata
// sections of a PE binary, rebased onto the image base.
func recover_file_pe(file *os.File, callback Callback) (err error) {
	pe_file, parse_err := pe.NewFile(file)
	if parse_err != nil {
		err = parse_err
		return
	}
	// fallback image base when the optional header type is unrecognized
	image_base := uint64(0x400000)
	var word_size uint64
	switch header := pe_file.OptionalHeader.(type) {
	case *pe.OptionalHeader32:
		word_size = 4
		image_base = uint64(header.ImageBase)
	case *pe.OptionalHeader64:
		word_size = 8
		image_base = header.ImageBase
	}
	for _, section := range pe_file.Sections {
		fmt.Fprintln(os.Stderr, "recovering", section.Name)
		if section.Name != ".data" && section.Name != ".rdata" {
			continue
		}
		if err = recover_section(true, word_size, 4, section.Name, image_base+uint64(section.VirtualAddress), section, callback); err != nil {
			return
		}
	}
	return
}
// RecoverFile opens filename, detects its executable format from the magic
// number, and recovers printable strings from it, reporting each through
// callback. Supported formats: PE ("MZ") and Mach-O (32- and 64-bit).
func RecoverFile(filename string, callback Callback) (err error) {
	var file *os.File
	file, err = os.Open(filename)
	if err != nil {
		return
	}
	// BUG FIX: the file was previously never closed
	defer file.Close()
	var magic [4]byte
	if _, err = file.ReadAt(magic[:], 0); err != nil {
		return
	}
	magic_number := binary.LittleEndian.Uint32(magic[:])
	switch {
	case magic[0] == 'M' && magic[1] == 'Z':
		err = recover_file_pe(file, callback)
	case magic_number == 0xfeedface:
		// 32-bit Mach-O magic
		err = recover_file_macho(4, file, callback)
	case magic_number == 0xfeedfacf:
		// 64-bit Mach-O magic
		err = recover_file_macho(8, file, callback)
	case magic_number == 0xcefaedfe:
		// byte-swapped 32-bit Mach-O magic
		// NOTE(review): the swapped 64-bit magic 0xcffaedfe is not handled —
		// confirm whether such samples are expected
		err = recover_file_macho(4, file, callback)
	default:
		err = errors.New("unknown file magic: " + filename)
	}
	return
}