From 2c2815ab0b4b9c9e68f3d457f52a104d979e6ce2 Mon Sep 17 00:00:00 2001 From: superp00t Date: Fri, 20 Mar 2026 01:58:16 -0400 Subject: [PATCH] feat(binana): add tokens database --- db/samples.parquet | 3 + db/tokens.parquet | 3 + go.mod | 12 +- go.sum | 21 ++ go/app/cmd/execute.go | 3 + go/app/cmd/make_samples/make-samples.go | 68 ++++ go/app/cmd/make_tokens/make-tokens.go | 57 +++ go/app/cmd/query/query.go | 72 ++++ go/app/util/dbutil/format.go | 14 + go/app/util/dbutil/writer.go | 80 ++++ go/app/util/demangle.go | 49 +++ go/app/util/exe.go | 31 ++ go/app/util/hash.go | 19 + go/app/util/make-samples.go | 224 +++++++++++ go/app/util/make-tokens.go | 303 +++++++++++++++ go/app/util/pdb.go | 474 ++++++++++++++++++++++++ go/app/util/query.go | 189 ++++++++++ go/db/sample.go | 47 +++ go/db/token.go | 79 ++++ go/pdbconv/db.go | 109 ++++++ go/pdbconv/xml.go | 93 +++++ go/stringrecovery/stringrecovery.go | 174 +++++++++ 22 files changed, 2122 insertions(+), 2 deletions(-) create mode 100644 db/samples.parquet create mode 100644 db/tokens.parquet create mode 100644 go/app/cmd/make_samples/make-samples.go create mode 100644 go/app/cmd/make_tokens/make-tokens.go create mode 100644 go/app/cmd/query/query.go create mode 100644 go/app/util/dbutil/format.go create mode 100644 go/app/util/dbutil/writer.go create mode 100644 go/app/util/demangle.go create mode 100644 go/app/util/exe.go create mode 100644 go/app/util/hash.go create mode 100644 go/app/util/make-samples.go create mode 100644 go/app/util/make-tokens.go create mode 100644 go/app/util/pdb.go create mode 100644 go/app/util/query.go create mode 100644 go/db/sample.go create mode 100644 go/db/token.go create mode 100644 go/pdbconv/db.go create mode 100644 go/pdbconv/xml.go create mode 100644 go/stringrecovery/stringrecovery.go diff --git a/db/samples.parquet b/db/samples.parquet new file mode 100644 index 0000000..7dcf0ad --- /dev/null +++ b/db/samples.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f19fac1fbb4db2383995a0285a30e1826e567e4198e35137ac773e0bad516401 +size 6011 diff --git a/db/tokens.parquet b/db/tokens.parquet new file mode 100644 index 0000000..9ea84cd --- /dev/null +++ b/db/tokens.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c31761b8675ce1fee186061dc84c184b5eb23ec92368230abbf611ffc9143f7 +size 156789790 diff --git a/go.mod b/go.mod index 52fec35..bba204b 100644 --- a/go.mod +++ b/go.mod @@ -4,18 +4,26 @@ go 1.25.5 require ( github.com/fatih/color v1.18.0 - github.com/pierrec/lz4/v4 v4.1.21 + github.com/pierrec/lz4/v4 v4.1.26 github.com/spf13/cobra v1.8.1 modernc.org/cc/v3 v3.41.0 ) require ( + github.com/andybalholm/brotli v1.2.0 // indirect + github.com/google/uuid v1.6.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/klauspost/compress v1.18.4 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect + github.com/parquet-go/bitpack v1.0.0 // indirect + github.com/parquet-go/jsonlite v1.5.0 // indirect + github.com/parquet-go/parquet-go v0.29.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/spf13/pflag v1.0.5 // indirect - golang.org/x/sys v0.25.0 // indirect + github.com/twpayne/go-geom v1.6.1 // indirect + golang.org/x/sys v0.42.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect lukechampine.com/uint128 v1.3.0 // indirect modernc.org/mathutil v1.6.0 // indirect modernc.org/strutil v1.2.0 // indirect diff --git a/go.sum b/go.sum index de33cfe..af7c3ca 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ= +github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/dustin/go-humanize v1.0.1 
h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= @@ -5,15 +7,27 @@ github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= +github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA= +github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs= +github.com/parquet-go/jsonlite v1.5.0 h1:ulS7lNWdPwiqDMLzTiXHYmIUhu99mavZh2iAVdXet3g= +github.com/parquet-go/jsonlite v1.5.0/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0= +github.com/parquet-go/parquet-go v0.29.0 h1:xXlPtFVR51jpSVzf+cgHnNIcb7Xet+iuvkbe0HIm90Y= +github.com/parquet-go/parquet-go v0.29.0/go.mod 
h1:navtkAYr2LGoJVp141oXPlO/sxLvaOe3la2JEoD8+rg= github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/pierrec/lz4/v4 v4.1.26 h1:GrpZw1gZttORinvzBdXPUXATeqlJjqUG/D87TKMnhjY= +github.com/pierrec/lz4/v4 v4.1.26/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -21,10 +35,17 @@ github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/twpayne/go-geom v1.6.1 h1:iLE+Opv0Ihm/ABIcvQFGIiFBXd76oBIar9drAwHFhR4= +github.com/twpayne/go-geom v1.6.1/go.mod h1:Kr+Nly6BswFsKM5sd31YaoWS5PeDDH2NftJTK7Gd028= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod 
h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= lukechampine.com/uint128 v1.3.0 h1:cDdUVfRwDUDovz610ABgFD17nXD4/uDgVHl2sC3+sbo= diff --git a/go/app/cmd/execute.go b/go/app/cmd/execute.go index 04eb251..1af67f1 100644 --- a/go/app/cmd/execute.go +++ b/go/app/cmd/execute.go @@ -4,6 +4,9 @@ import ( _ "github.com/thunderbrewhq/binana/go/app/cmd/add_symbol" _ "github.com/thunderbrewhq/binana/go/app/cmd/lint" _ "github.com/thunderbrewhq/binana/go/app/cmd/make" + _ "github.com/thunderbrewhq/binana/go/app/cmd/make_samples" + _ "github.com/thunderbrewhq/binana/go/app/cmd/make_tokens" + _ "github.com/thunderbrewhq/binana/go/app/cmd/query" "github.com/thunderbrewhq/binana/go/app/cmd/root" _ "github.com/thunderbrewhq/binana/go/app/cmd/tidy" diff --git a/go/app/cmd/make_samples/make-samples.go b/go/app/cmd/make_samples/make-samples.go new file mode 100644 index 0000000..3e33fe4 --- /dev/null +++ b/go/app/cmd/make_samples/make-samples.go @@ -0,0 +1,68 @@ +package make_samples + +import ( + "github.com/spf13/cobra" + "github.com/thunderbrewhq/binana/go/app" + "github.com/thunderbrewhq/binana/go/app/cmd/root" + "github.com/thunderbrewhq/binana/go/app/util" + "github.com/thunderbrewhq/binana/go/app/util/dbutil" +) + +var make_samples_cmd = cobra.Command{ + Use: "make-samples", + Run: run_make_samples_command, +} + +func init() { + f := make_samples_cmd.Flags() + f.StringP("source", "s", "", "required: source tree of sample binaries") + f.StringP("output-file", "o", "", "write the database to a file") + f.StringSlice("direct-mirror", nil, "list of direct mirror URLs that already contain the sample binaries") + f.StringSlice("ipfs-gateway", nil, "list of IPFS gateways") + f.StringP("format", "f", "json", "the format of the output database [json|parquet]") + root.RootCmd.AddCommand(&make_samples_cmd) 
+} + +func run_make_samples_command(cmd *cobra.Command, args []string) { + f := cmd.Flags() + var ( + params util.MakeSampleDatabaseParams + err error + format string + ) + params.Source, err = f.GetString("source") + if err != nil { + app.Fatal(err) + } + if params.Source == "" { + cmd.Help() + return + } + params.Output, err = f.GetString("output-file") + if err != nil { + app.Fatal(err) + } + format, err = f.GetString("format") + if err != nil { + app.Fatal(err) + } + switch format { + case "json": + params.Format = dbutil.DatabaseJSON + case "parquet": + params.Format = dbutil.DatabaseParquet + default: + app.Fatal("unknown format", format) + } + + params.DirectMirrors, err = f.GetStringSlice("direct-mirror") + if err != nil { + app.Fatal(err) + } + params.IPFSGateways, err = f.GetStringSlice("ipfs-gateway") + if err != nil { + app.Fatal(err) + } + + util.MakeSampleDatabase(¶ms) +} diff --git a/go/app/cmd/make_tokens/make-tokens.go b/go/app/cmd/make_tokens/make-tokens.go new file mode 100644 index 0000000..218a40c --- /dev/null +++ b/go/app/cmd/make_tokens/make-tokens.go @@ -0,0 +1,57 @@ +package make_tokens + +import ( + "github.com/spf13/cobra" + "github.com/thunderbrewhq/binana/go/app" + "github.com/thunderbrewhq/binana/go/app/cmd/root" + "github.com/thunderbrewhq/binana/go/app/util" + "github.com/thunderbrewhq/binana/go/app/util/dbutil" +) + +var make_tokens_cmd = cobra.Command{ + Use: "make-tokens", + Run: run_make_tokens_command, +} + +func init() { + f := make_tokens_cmd.Flags() + f.StringP("source", "s", "", "required: source tree of sample binaries") + f.StringP("output-file", "o", "", "write the database to a file") + f.StringP("format", "f", "json", "the format of the output database [json|parquet]") + root.RootCmd.AddCommand(&make_tokens_cmd) +} + +func run_make_tokens_command(cmd *cobra.Command, args []string) { + f := cmd.Flags() + var ( + params util.MakeTokenDatabaseParams + err error + format string + ) + params.Source, err = 
f.GetString("source") + if err != nil { + app.Fatal(err) + } + if params.Source == "" { + cmd.Help() + return + } + params.Output, err = f.GetString("output-file") + if err != nil { + app.Fatal(err) + } + format, err = f.GetString("format") + if err != nil { + app.Fatal(err) + } + switch format { + case "json": + params.Format = dbutil.DatabaseJSON + case "parquet": + params.Format = dbutil.DatabaseParquet + default: + app.Fatal("unknown format", format) + } + + util.MakeTokenDatabase(¶ms) +} diff --git a/go/app/cmd/query/query.go b/go/app/cmd/query/query.go new file mode 100644 index 0000000..2253411 --- /dev/null +++ b/go/app/cmd/query/query.go @@ -0,0 +1,72 @@ +package query + +import ( + "math" + + "github.com/spf13/cobra" + "github.com/thunderbrewhq/binana/go/app" + "github.com/thunderbrewhq/binana/go/app/cmd/root" + "github.com/thunderbrewhq/binana/go/app/util" +) + +var query_cmd = cobra.Command{ + Use: "q regexp", + Args: cobra.MinimumNArgs(1), + Short: "query the token database for information", + Run: run_query_cmd, +} + +func init() { + f := query_cmd.Flags() + f.Uint32("min-build", 0, "the minimum build to return tokens for") + f.Uint32("max-build", math.MaxUint32, "the maximum build to return tokens for") + f.StringSlice("program", nil, "a list of programs to return tokens for") + f.StringSlice("os", nil, "a list of kernel names to return tokens for (windows, darwin, linux)") + f.StringSlice("arch", nil, "a list of CPU architectures to return tokens for (ppc, 386, amd64)") + f.String("present", "normal", "control the way tokens are presented to console (normal, name-only)") + root.RootCmd.AddCommand(&query_cmd) +} + +func run_query_cmd(cmd *cobra.Command, args []string) { + f := cmd.Flags() + var ( + params util.QueryParams + err error + presentation_mode string + ) + params.MinBuild, err = f.GetUint32("min-build") + if err != nil { + app.Fatal(err) + } + params.MaxBuild, err = f.GetUint32("max-build") + if err != nil { + app.Fatal(err) + } + 
params.Program, err = f.GetStringSlice("program") + if err != nil { + app.Fatal(err) + } + params.OS, err = f.GetStringSlice("os") + if err != nil { + return + } + params.Arch, err = f.GetStringSlice("arch") + if err != nil { + return + } + presentation_mode, err = f.GetString("present") + if err != nil { + return + } + switch presentation_mode { + case "normal": + params.Present = util.PresentQueryNormal + case "name-only": + params.Present = util.PresentQueryNameOnly + default: + cmd.Help() + return + } + params.Token = args[0] + util.Query(¶ms) +} diff --git a/go/app/util/dbutil/format.go b/go/app/util/dbutil/format.go new file mode 100644 index 0000000..c7c8723 --- /dev/null +++ b/go/app/util/dbutil/format.go @@ -0,0 +1,14 @@ +package dbutil + +import "errors" + +type DatabaseFormat uint8 + +const ( + DatabaseParquet DatabaseFormat = iota + DatabaseJSON +) + +var ( + ErrUnknownDatabaseFormat = errors.New("dbutil: unknown database format") +) diff --git a/go/app/util/dbutil/writer.go b/go/app/util/dbutil/writer.go new file mode 100644 index 0000000..0a89478 --- /dev/null +++ b/go/app/util/dbutil/writer.go @@ -0,0 +1,80 @@ +package dbutil + +import ( + "encoding/json" + "fmt" + "io" + "os" + + "github.com/parquet-go/parquet-go" +) + +type Writer[T any] struct { + write func([]T) (err error) + close func() (err error) +} + +func (writer *Writer[T]) WriteEntries(entries []T) (err error) { + err = writer.write(entries) + return +} + +func (writer *Writer[T]) Close() (err error) { + err = writer.close() + return +} + +func Open[T any](name string, format DatabaseFormat) (writer *Writer[T], err error) { + writer = new(Writer[T]) + var ( + file *os.File + output io.Writer + ) + if name == "" { + output = os.Stdout + } else { + file, err = os.Create(name) + if err != nil { + return + } + output = file + } + switch format { + + case DatabaseJSON: + encoder := json.NewEncoder(output) + writer.write = func(entries []T) (err error) { + for _, entry := range entries { + if 
// demangle_cache remembers successful demangler results, keyed by the mangled name.
var demangle_cache = make(map[string]string)

// demangle runs the external `demumble` tool on str and returns its output
// with the trailing newline removed.
//
// Successful results are cached in demangle_cache. An error is returned when
// the tool cannot be run or exits unsuccessfully, when it prints nothing, or
// when it echoes str back unchanged.
func demangle(str string) (demangled string, err error) {
	if cached, ok := demangle_cache[str]; ok {
		demangled = cached
		return
	}

	var output bytes.Buffer
	c := exec.Command("demumble", str)
	c.Stdout = &output
	// a missing binary or a non-zero exit is reported instead of being dropped
	if run_err := c.Run(); run_err != nil {
		err = fmt.Errorf("cannot demangle: %w", run_err)
		return
	}
	if output.Len() == 0 {
		err = fmt.Errorf("cannot demangle")
		return
	}
	demangled = strings.TrimSuffix(output.String(), "\n")
	if str == demangled {
		err = fmt.Errorf("cannot demangle")
		return
	}
	demangle_cache[str] = demangled
	return
}

// looks_mangled is a cheap heuristic that decides whether str is worth
// passing to the demangler. It reports true when str starts with ".", "?" or
// "_Z", starts with an ASCII digit, or contains an ASCII digit and no space.
func looks_mangled(str string) bool {
	switch {
	case strings.HasPrefix(str, "."),
		strings.HasPrefix(str, "?"),
		strings.HasPrefix(str, "_Z"):
		return true
	case len(str) > 0 && str[0] >= '0' && str[0] <= '9':
		return true
	case strings.ContainsAny(str, "0123456789") && !strings.Contains(str, " "):
		return true
	}
	return false
}
// ---- go/app/util/exe.go ----

// get_exe_base_address opens the PE file called name and returns the
// ImageBase recorded in its optional header. When the file parses but carries
// neither a 32-bit nor a 64-bit optional header, 0x400000 is returned.
// An error is returned when the file cannot be opened or parsed as PE.
func get_exe_base_address(name string) (base_address uint64, err error) {
	var file *os.File
	file, err = os.Open(name)
	if err != nil {
		return
	}
	// release the handle on every path, including a failed parse
	defer file.Close()

	var pe_file *pe.File
	pe_file, err = pe.NewFile(file)
	if err != nil {
		return
	}

	base_address = uint64(0x400000)
	switch h := pe_file.OptionalHeader.(type) {
	case *pe.OptionalHeader32:
		base_address = uint64(h.ImageBase)
	case *pe.OptionalHeader64:
		base_address = h.ImageBase
	}
	return
}

// ---- go/app/util/hash.go ----

// hash_file returns the lowercase hexadecimal SHA-256 digest of the contents
// of the file called name. The whole file is read into memory.
func hash_file(name string) (id string, err error) {
	var b []byte
	b, err = os.ReadFile(name)
	if err != nil {
		return
	}
	sum := sha256.Sum256(b)
	id = hex.EncodeToString(sum[:])
	return
}
+ // Used to generate a list of mirror URLs for sample binaries + DirectMirrors []string + + // List of IPFS Gateway URLs + // If not empty, a CID for the sample tree will be created, + // Actually uploading anything in the sample tree, however, + // is outside the scope of this tool + IPFSGateways []string +} + +type sample_database struct { + writer *dbutil.Writer[db.Sample] + ipfs_tree_cid string + buffer []db.Sample +} + +func (sample_database *sample_database) add(sample db.Sample) (err error) { + sample_database.buffer = append(sample_database.buffer, sample) + return +} + +func (sample_database *sample_database) Close() (err error) { + if err = sample_database.writer.WriteEntries(sample_database.buffer); err != nil { + return + } + err = sample_database.writer.Close() + return +} + +func (sample_database *sample_database) make_sample_file(params *MakeSampleDatabaseParams, name, relative_name string) (err error) { + var sample db.Sample + // infer mime-type from extension + switch filepath.Ext(name) { + case ".exe": + sample.MimeType = "application/vnd.microsoft.portable-executable" + case ".pdb": + sample.MimeType = "application/x-ms-pdb" + // associate the PDB with its EXE + sample_exe_name := strings.TrimSuffix(name, ".pdb") + ".exe" + if _, err = os.Stat(sample_exe_name); err == nil { + sample.Executable, err = hash_file(sample_exe_name) + if err != nil { + panic(err) + } + } + case ".macho": + sample.MimeType = "application/x-mach-binary" + case ".elf": + sample.MimeType = "application/x-executable" + default: + // don't care about this + return + } + + sample.ID, err = hash_file(name) + if err != nil { + panic(err) + } + + // get the base filename + base_name := filepath.Base(name) + + // split the base filename without its extension + filename_components := strings.Split(strings.TrimSuffix(base_name, filepath.Ext(base_name)), "-") + // now, parse the filename (these must be correctly named!) 
+ sample.Program = filename_components[0] + sample.Version = filename_components[1] + var build uint64 + build, err = strconv.ParseUint(filename_components[2], 0, 64) + if err != nil { + panic(err) + } + sample.Build = uint32(build) + sample.OS = filename_components[3] + sample.Arch = filename_components[4] + + // now, create various mirrors + for _, direct_mirror := range params.DirectMirrors { + sample.Mirrors = append(sample.Mirrors, db.SampleMirror{ + Kind: db.MirrorDirect, + URL: direct_mirror + relative_name, + }) + } + for _, ipfs_gateway := range params.IPFSGateways { + sample.Mirrors = append(sample.Mirrors, db.SampleMirror{ + Kind: db.MirrorIPFS, + URL: ipfs_gateway + "/" + sample_database.ipfs_tree_cid + relative_name, + }) + } + + // now write the sample + + if err = sample_database.add(sample); err != nil { + return + } + return +} + +func (sample_database *sample_database) make_tree(params *MakeSampleDatabaseParams, name, relative_name string) (err error) { + var ( + tree_entries []os.DirEntry + ) + + tree_entries, err = os.ReadDir(name) + if err != nil { + return + } + + for _, tree_entry := range tree_entries { + if tree_entry.IsDir() { + if err = sample_database.make_tree(params, name+"/"+tree_entry.Name(), relative_name+"/"+tree_entry.Name()); err != nil { + return + } + } else { + if err = sample_database.make_sample_file(params, name+"/"+tree_entry.Name(), relative_name+"/"+tree_entry.Name()); err != nil { + return + } + } + } + + return +} + +func ipfs_generate_file_cid(name string) (cid string, err error) { + + // todo + // use command: + // ipfs add -qr --only-hash . 
+ // inside the root of the sample tree + // the last CID is the root of the tree + + var ( + wd string + ) + wd, err = os.Getwd() + if err != nil { + return + } + err = os.Chdir(name) + if err != nil { + return + } + + command := exec.Command("ipfs", "add", "-qr", "--only-hash", ".") + var command_output bytes.Buffer + command.Stdout = &command_output + command.Run() + if command.ProcessState.ExitCode() != 0 { + os.Chdir(wd) + err = fmt.Errorf("util: ipfs tool exited: %d", command.ProcessState.ExitCode()) + return + } + + // Parse command Output + command_output_scanner := bufio.NewScanner(&command_output) + + for command_output_scanner.Scan() { + cid = command_output_scanner.Text() + } + + err = os.Chdir(wd) + + return + +} + +func MakeSampleDatabase(params *MakeSampleDatabaseParams) { + var ( + err error + sample_database sample_database + ) + + // if we want to generate IPFS links, start by getting the CID for the sample tree + if len(params.IPFSGateways) != 0 { + sample_database.ipfs_tree_cid, err = ipfs_generate_file_cid(params.Source) + if err != nil { + app.Fatal(err) + return + } + } + + sample_database.writer, err = dbutil.Open[db.Sample](params.Output, params.Format) + if err != nil { + app.Fatal(err) + } + + // make the root tree, with our params, the source as the first tree, and "" (root) as the relative path + if err = sample_database.make_tree(params, params.Source, ""); err != nil { + app.Fatal(err) + } + + if err = sample_database.Close(); err != nil { + app.Fatal(err) + } +} diff --git a/go/app/util/make-tokens.go b/go/app/util/make-tokens.go new file mode 100644 index 0000000..fd61544 --- /dev/null +++ b/go/app/util/make-tokens.go @@ -0,0 +1,303 @@ +package util + +import ( + "compress/gzip" + "debug/macho" + "encoding/json" + "fmt" + "os" + "path/filepath" + "slices" + "strings" + "time" + + "github.com/thunderbrewhq/binana/go/app" + "github.com/thunderbrewhq/binana/go/app/util/dbutil" + "github.com/thunderbrewhq/binana/go/db" + 
"github.com/thunderbrewhq/binana/go/pdbconv" + "github.com/thunderbrewhq/binana/go/stringrecovery" +) + +type MakeTokenDatabaseParams struct { + Source string + Output string + Format dbutil.DatabaseFormat +} + +func MakeTokenDatabase(params *MakeTokenDatabaseParams) { + var ( + tokens_database tokens_database + err error + ) + if err = tokens_database.Open(params.Output, params.Format); err != nil { + app.Fatal(err) + } + if err = tokens_database.make(params.Source); err != nil { + app.Fatal(err) + } + if err = tokens_database.Close(); err != nil { + app.Fatal(err) + } +} + +type tokens_database struct { + sequence uint64 + writer *dbutil.Writer[db.Token] +} + +func (tokens_database *tokens_database) next_token_id() (id uint64) { + id = tokens_database.sequence + tokens_database.sequence++ + return +} + +func (tokens_database *tokens_database) Open(name string, format dbutil.DatabaseFormat) (err error) { + tokens_database.sequence = 1 + tokens_database.writer, err = dbutil.Open[db.Token](name, format) + return +} + +func (tokens_database *tokens_database) Close() (err error) { + err = tokens_database.writer.Close() + return +} + +func (tokens_database *tokens_database) Write(token *db.Token) (err error) { + tokens := []db.Token{*token} + if err = tokens_database.writer.WriteEntries(tokens); err != nil { + return + } + return +} + +func (tokens_database *tokens_database) make_file_pdb(name string) (err error) { + exe_name := strings.TrimSuffix(name, ".pdb") + ".exe" + var base_address uint64 + base_address, err = get_exe_base_address(exe_name) + if err != nil { + return + } + + fmt.Fprintln(os.Stderr, "[pdb]", name) + var source_id string + source_id, err = hash_file(name) + if err != nil { + return + } + fmt.Fprintln(os.Stderr, "[pdb]", source_id) + // check for the existence of an alternate, .pdb.json.gz file + _, err = os.Stat(name + ".json.gz") + if err != nil { + return + } + + var ( + gzip_file *os.File + gzip_reader *gzip.Reader + ) + gzip_file, err = 
os.Open(name + ".json.gz") + if err != nil { + return + } + + var pdb pdbconv.ProgramDatabase + gzip_reader, err = gzip.NewReader(gzip_file) + json_decoder := json.NewDecoder(gzip_reader) + + if err = json_decoder.Decode(&pdb); err != nil { + return + } + + gzip_file.Close() + + var v pdb_token_visitor + v.init(tokens_database, source_id, base_address) + if err = v.visit_all(&pdb); err != nil { + return + } + if err = v.write_tokens(); err != nil { + return + } + + return +} + +func (tokens_database *tokens_database) write_string_token(source_id string, section_name string, address uint64, str string) (err error) { + var db_token db.Token + db_token.ID = tokens_database.next_token_id() + db_token.Source = source_id + db_token.Section = section_name + db_token.Kind = db.OriginalStringToken + db_token.Offset = fmt.Sprintf("%X", address) + + db_token.Names = append(db_token.Names, db.TokenName{db.OriginalName, str}) + + // detect if this is a mangled type identifier + if looks_mangled(str) { + demangled, err := demangle(str) + if err == nil { + db_token.Names = append(db_token.Names, db.TokenName{db.DemangledName, demangled}) + } + } + + err = tokens_database.Write(&db_token) + return +} + +func (tokens_database *tokens_database) make_file_pe(name string) (err error) { + var id string + id, err = hash_file(name) + if err != nil { + return + } + err = stringrecovery.RecoverFile(name, func(section_name string, address uint64, str string) { + fmt.Fprintf(os.Stderr, "[pe] string found: %s %08X %s\n", section_name, address, str) + tokens_database.write_string_token(id, section_name, address, str) + }) + return +} + +func (tokens_database *tokens_database) make_file_macho(name string) (err error) { + fmt.Fprintln(os.Stderr, "[mach-o]", name) + var source_id string + source_id, err = hash_file(name) + if err != nil { + return + } + fmt.Fprintln(os.Stderr, "[mach-o]", source_id) + var ( + file *os.File + macho_file *macho.File + ) + file, err = os.Open(name) + if err != nil { 
+ return + } + + macho_file, err = macho.NewFile(file) + if err != nil { + return + } + _, dwarf_err := macho_file.DWARF() + if dwarf_err == nil { + fmt.Fprintln(os.Stderr, "DWARF!") + time.Sleep(5 * time.Second) + } + fmt.Fprintln(os.Stderr, "[mach-o]", "cpu", macho_file.FileHeader.Cpu) + fmt.Fprintln(os.Stderr, "[mach-o]", "loads:") + // for _, load := range macho_file.Loads { + // fmt.Fprintln(os.Stderr, "[mach-o]", load.String()) + // } + fmt.Fprintln(os.Stderr, "[mach-o]", "sections:") + for _, section := range macho_file.Sections { + fmt.Fprintln(os.Stderr, "section", section.SectionHeader.Name) + } + + if macho_file.Dysymtab != nil { + fmt.Fprintln(os.Stderr, "[mach-o]", "does not contain a dysymtab") + } else { + fmt.Fprintln(os.Stderr, "[mach-o]", "dysymtab:") + } + + var imported_symbols []string + imported_symbols, err = macho_file.ImportedSymbols() + if err != nil { + return + } + if macho_file.Symtab == nil { + fmt.Fprintln(os.Stderr, "[mach-o]", "does not contain a symtab") + } else { + fmt.Fprintln(os.Stderr, "[mach-o]", "symtab:") + for _, sym := range macho_file.Symtab.Syms { + imported := slices.Contains(imported_symbols, sym.Name) + var section_name string + if sym.Sect != 0 { + section_name = macho_file.Sections[sym.Sect-1].SectionHeader.Name + } + if imported { + fmt.Fprintf(os.Stderr, "[mach-o] imported %s %02x %s\n", section_name, sym.Type, sym.Name) + } else { + if sym.Name == "" { + // fmt.Fprintln(os.Stderr, "[mach-o]", "symbol has no name", "sect="+section_name, sym.Type, sym.Value, sym.Desc) + } else { + fmt.Fprintf(os.Stderr, "[mach-o] internal %s %02x %s\n", section_name, sym.Type, sym.Name) + var token db.Token + token.ID = tokens_database.next_token_id() + token.Source = source_id + token.Kind = db.OriginalSymbolToken + token.Section = section_name + token.Offset = fmt.Sprintf("%X", sym.Value) + token.Names = append(token.Names, db.TokenName{db.OriginalName, sym.Name}) + + if looks_mangled(sym.Name) { + demangled, err := 
demangle(sym.Name) + if err == nil { + token.Names = append(token.Names, db.TokenName{db.DemangledName, demangled}) + } + } + + tokens_database.Write(&token) + } + } + } + } + + file.Close() + + if err = stringrecovery.RecoverFile(name, func(section_name string, address uint64, str string) { + fmt.Fprintf(os.Stderr, "[mach-o] string found: %s %08X %s\n", section_name, address, str) + tokens_database.write_string_token(source_id, section_name, address, str) + }); err != nil { + return + } + + return +} + +func (tokens_database *tokens_database) make_file(name string) (err error) { + switch filepath.Ext(name) { + case ".macho": + err = tokens_database.make_file_macho(name) + case ".pdb": + err = tokens_database.make_file_pdb(name) + case ".exe": + err = tokens_database.make_file_pe(name) + } + return +} + +func (tokens_database *tokens_database) make_directory(name string) (err error) { + var entries []os.DirEntry + entries, err = os.ReadDir(name) + if err != nil { + return + } + + for _, entry := range entries { + if entry.IsDir() { + if err = tokens_database.make_directory(filepath.Join(name, entry.Name())); err != nil { + return + } + } else { + if err = tokens_database.make_file(filepath.Join(name, entry.Name())); err != nil { + return + } + } + } + + return +} + +func (tokens_database *tokens_database) make(name string) (err error) { + var fi os.FileInfo + fi, err = os.Stat(name) + if err != nil { + return + } + if fi.IsDir() { + err = tokens_database.make_directory(name) + } else { + err = tokens_database.make_file(name) + } + return +} diff --git a/go/app/util/pdb.go b/go/app/util/pdb.go new file mode 100644 index 0000000..b2f7843 --- /dev/null +++ b/go/app/util/pdb.go @@ -0,0 +1,474 @@ +package util + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "maps" + "slices" + "sort" + "strconv" + + "github.com/thunderbrewhq/binana/go/db" + "github.com/thunderbrewhq/binana/go/pdbconv" +) + +type pdb_token_visitor struct { + tokens_database *tokens_database + + 
// location of the base module + base_address uint64 + pdb_source_id string + strings map[string]*db.Token + datatypes map[string]*db.Token + // maps a symbol to a token + symbols map[uint64]*db.Token + + constants []db.Token +} + +func (v *pdb_token_visitor) init(tokens_database *tokens_database, pdb_source_id string, base_address uint64) { + v.base_address = base_address + v.pdb_source_id = pdb_source_id + v.strings = make(map[string]*db.Token) + v.datatypes = make(map[string]*db.Token) + // map of address to token + v.symbols = make(map[uint64]*db.Token) + v.tokens_database = tokens_database +} + +func (v *pdb_token_visitor) visit_class(class *pdbconv.Class) (err error) { + token, ok := v.datatypes[class.Name] + if !ok { + token = new(db.Token) + v.datatypes[class.Name] = token + } + + // set token source to pdb + token.Source = v.pdb_source_id + // kind is debug information token + token.Kind = db.OriginalDatatypeToken + + // set original name + token.Names = append(token.Names, db.TokenName{db.OriginalName, class.Name}) + + // set the basic type + token.Keyword = "class" + + for _, member := range class.Members { + var token_member db.TokenMember + if member.Kind == "Member" { + token_member.Kind = db.FieldMember + } else if member.Kind == "Unknown" && member.Datatype == "void *" { + token_member.Kind = db.MethodMember + } else { + continue + } + token_member.Key = member.Name + token_member.Value = member.Datatype + + if !slices.Contains(token.Members, token_member) { + token.Members = append(token.Members, token_member) + } + } + return +} + +func (v *pdb_token_visitor) visit_datatype(datatype *pdbconv.Datatype) (err error) { + token, ok := v.datatypes[datatype.Name] + if !ok { + token = new(db.Token) + v.datatypes[datatype.Name] = token + } + + // set token source to pdb + token.Source = v.pdb_source_id + // kind is debug information token + token.Kind = db.OriginalDatatypeToken + + // set original name + token.Names = append(token.Names, 
db.TokenName{db.OriginalName, datatype.Name}) + + // set the basic type + if datatype.Kind == "Structure" { + token.Keyword = "struct" + } else if datatype.Kind == "Union" { + token.Keyword = "union" + } else { + err = fmt.Errorf("unhandled datatype kind '%s'", datatype.Kind) + return + } + + for _, member := range datatype.Members { + var token_member db.TokenMember + if member.Kind == "Member" { + token_member.Kind = db.FieldMember + } else if member.Kind == "Unknown" && member.Datatype == "void *" { + token_member.Kind = db.MethodMember + } else { + continue + } + token_member.Key = member.Name + token_member.Value = member.Datatype + + if !slices.Contains(token.Members, token_member) { + token.Members = append(token.Members, token_member) + } + } + return +} + +func (v *pdb_token_visitor) visit_function(function *pdbconv.Function) (err error) { + var address uint64 + address, err = strconv.ParseUint(function.Address, 0, 64) + if err != nil { + return + } + + address = v.base_address + address + + token, ok := v.symbols[address] + if !ok { + token = new(db.Token) + v.symbols[address] = token + } + + // set token source to pdb + token.Source = v.pdb_source_id + // kind is symbol information token + token.Kind = db.OriginalSymbolToken + // set address + token.Offset = fmt.Sprintf("%X", address) + // set original name + token.Names = append(token.Names, db.TokenName{db.OriginalName, function.Name}) + + if looks_mangled(function.Name) { + demangled, demangler_err := demangle(function.Name) + if demangler_err == nil { + token.Names = append(token.Names, db.TokenName{db.DemangledName, demangled}) + } + } + + // visit source files + for _, line_number := range function.LineNumbers { + if err = v.visit_string(line_number.SourceFile); err != nil { + return + } + } + + // classify stack variables as members + for _, stack_variable := range function.StackVariables { + var token_member db.TokenMember + if stack_variable.Kind == "Parameter" || stack_variable.Kind == 
"ObjectPointer" { + token_member.Kind = db.ParameterMember + } else if stack_variable.Kind == "Local" { + token_member.Kind = db.LocalMember + } else if stack_variable.Kind == "StaticLocal" { + token_member.Kind = db.StaticLocalMember + } else if stack_variable.Kind == "Constant" { + // these are repeated elsewhere + continue + } else { + err = fmt.Errorf("unhandled stack variable kind '%s'", stack_variable.Kind) + return + } + + token_member.Key = stack_variable.Name + token_member.Value = stack_variable.Datatype + + token.Members = append(token.Members, token_member) + } + + return +} + +func (v *pdb_token_visitor) visit_enum(enum *pdbconv.Enum) (err error) { + // create hash of enum's contents and use to key the datatype + h := sha256.New() + h.Write([]byte(enum.Name)) + for _, member := range enum.Members { + h.Write([]byte(member.Name)) + h.Write([]byte(fmt.Sprintf("%d", member.Value))) + } + name := hex.EncodeToString(h.Sum(nil)) + + token, ok := v.datatypes[name] + if !ok { + token = new(db.Token) + v.datatypes[name] = token + } + + token.Source = v.pdb_source_id + + token.Keyword = "enum" + + // apply name (may be __unnamed) + token.Names = append(token.Names, db.TokenName{db.OriginalName, enum.Name}) + + // this is an original datatype + token.Kind = db.OriginalDatatypeToken + + for _, member := range enum.Members { + var token_member db.TokenMember + token_member.Kind = db.EnumMember + token_member.Key = member.Name + token_member.Value = fmt.Sprintf("%d", member.Value) + + if !slices.Contains(token.Members, token_member) { + token.Members = append(token.Members, token_member) + } + } + + return +} + +func (v *pdb_token_visitor) visit_string(s string) (err error) { + token, ok := v.strings[s] + if !ok { + token = new(db.Token) + v.strings[s] = token + } + + // apply source + token.Source = v.pdb_source_id + + // this is a string token + token.Kind = db.OriginalStringToken + + // add name + var token_name db.TokenName + token_name.Kind = db.OriginalName + 
token_name.Name = s + token.Names = []db.TokenName{token_name} + + return +} + +func (v *pdb_token_visitor) visit_source_files_table(table *pdbconv.Table) (err error) { + for _, source_file := range table.SourceFiles { + if err = v.visit_string(source_file.Name); err != nil { + return + } + } + + return +} + +func (v *pdb_token_visitor) visit_constant(symbol *pdbconv.TableSymbol) (err error) { + var token db.Token + token.ID = v.tokens_database.next_token_id() + token.Source = v.pdb_source_id + token.Keyword = "const" + token.Datatype = symbol.Datatype + + if symbol.Name != "" { + var name db.TokenName + name.Kind = db.OriginalName + name.Name = symbol.Name + token.Names = append(token.Names, name) + } + + if symbol.Undecorated != "" { + var name db.TokenName + name.Kind = db.OriginalName + name.Name = symbol.Undecorated + token.Names = append(token.Names, name) + } + + var value db.TokenMember + value.Kind = db.ConstantValueMember + value.Value = symbol.Value + + token.Members = append(token.Members, value) + + v.constants = append(v.constants, token) + return +} + +func (v *pdb_token_visitor) visit_table_symbol(symbol *pdbconv.TableSymbol) (err error) { + if symbol.Address == "0x0" { + if symbol.Value != "" && symbol.Kind == "Constant" { + err = v.visit_constant(symbol) + return + } + + // so, this does not correspond to an actual symbol. + // we can still mine it for string tokens. 
+ if symbol.Name != "" { + if err = v.visit_string(symbol.Name); err != nil { + return + } + } + + if symbol.Undecorated != "" { + if err = v.visit_string(symbol.Name); err != nil { + return + } + } + + return + } + + // this corresponds to an address + // compute the real address + var address uint64 + address, err = strconv.ParseUint(symbol.Address, 0, 64) + if err != nil { + return + } + + address = v.base_address + address + + token, ok := v.symbols[address] + if !ok { + token = new(db.Token) + v.symbols[address] = token + } + token.Source = v.pdb_source_id + + if symbol.Datatype != "" { + token.Datatype = symbol.Datatype + } + + if symbol.Kind == "FileStatic" { + token.Keyword = "static" + } else if symbol.Kind == "Global" { + token.Keyword = "global" + } + + if symbol.Name != "" { + token_name := db.TokenName{db.OriginalName, symbol.Name} + if !slices.Contains(token.Names, token_name) { + token.Names = append(token.Names, token_name) + } + } + + if symbol.Undecorated != "" { + undecorated := db.TokenName{db.OriginalName, symbol.Undecorated} + if !slices.Contains(token.Names, undecorated) { + token.Names = append(token.Names, undecorated) + } + } + + return +} + +func (v *pdb_token_visitor) visit_symbols_table(table *pdbconv.Table) (err error) { + for _, symbol := range table.Symbols { + if err = v.visit_table_symbol(&symbol); err != nil { + return + } + } + + return +} + +func (v *pdb_token_visitor) visit_table(table *pdbconv.Table) (err error) { + if table.Name == "SourceFiles" { + err = v.visit_source_files_table(table) + } else if table.Name == "Symbols" { + err = v.visit_symbols_table(table) + } + return +} + +func (v *pdb_token_visitor) visit_typedef(typedef *pdbconv.Typedef) (err error) { + token, ok := v.datatypes[typedef.Name] + if !ok { + token = new(db.Token) + v.datatypes[typedef.Name] = token + } + + token.Source = v.pdb_source_id + + token.Kind = db.OriginalDatatypeToken + + var token_name db.TokenName + token_name.Kind = db.OriginalName + 
token_name.Name = typedef.Name + + if !slices.Contains(token.Names, token_name) { + token.Names = append(token.Names, token_name) + } + + token.Datatype = typedef.Basetype + + return +} + +func (v *pdb_token_visitor) visit_all(pdb *pdbconv.ProgramDatabase) (err error) { + for _, class := range pdb.Classes { + if err = v.visit_class(&class); err != nil { + return + } + } + + for _, datatype := range pdb.Datatypes { + if err = v.visit_datatype(&datatype); err != nil { + return + } + } + + for _, enum := range pdb.Enums { + if err = v.visit_enum(&enum); err != nil { + return + } + } + + for _, function := range pdb.Functions { + if err = v.visit_function(&function); err != nil { + return + } + } + + for _, table := range pdb.Tables { + if err = v.visit_table(&table); err != nil { + return + } + } + + for _, typedef := range pdb.Typedefs { + if err = v.visit_typedef(&typedef); err != nil { + return + } + } + + return +} + +func (v *pdb_token_visitor) write_tokens() (err error) { + datatypes := slices.Collect(maps.Keys(v.datatypes)) + sort.Strings(datatypes) + symbols := slices.Collect(maps.Keys(v.symbols)) + slices.SortFunc(symbols, func(a, b uint64) int { + if a < b { + return -1 + } else if a == b { + return 0 + } + return 1 + }) + strings := slices.Collect(maps.Keys(v.strings)) + sort.Strings(strings) + for _, datatype := range datatypes { + if err = v.tokens_database.Write(v.datatypes[datatype]); err != nil { + return + } + } + for _, symbol := range symbols { + if err = v.tokens_database.Write(v.symbols[symbol]); err != nil { + return + } + } + for _, string := range strings { + if err = v.tokens_database.Write(v.strings[string]); err != nil { + return + } + } + for _, constant := range v.constants { + if err = v.tokens_database.Write(&constant); err != nil { + return + } + } + return +} diff --git a/go/app/util/query.go b/go/app/util/query.go new file mode 100644 index 0000000..be08ec4 --- /dev/null +++ b/go/app/util/query.go @@ -0,0 +1,189 @@ +package util + 
+import ( + "encoding/json" + "fmt" + "os" + "regexp" + "slices" + + "github.com/parquet-go/parquet-go" + "github.com/thunderbrewhq/binana/go/app" + "github.com/thunderbrewhq/binana/go/db" +) + +type QueryPresentationMode uint8 + +const ( + PresentQueryNormal QueryPresentationMode = iota + PresentQueryNameOnly +) + +type QueryParams struct { + // + Present QueryPresentationMode + // Match pattern for profile + Profile string + // Possible values for Program + Program []string + // Possible values for OS + OS []string + // Possible values for arch + Arch []string + // Range of builds to return information for + MinBuild uint32 + MaxBuild uint32 + // Regular expression for tokens (symbols/type information) + Token string +} + +type token_query struct { + params *QueryParams + sample_database map[string]db.Sample + token_regexp *regexp.Regexp +} + +func (token_query *token_query) present_token(token *db.Token) { + if token_query.params.Present == PresentQueryNameOnly { + for _, name := range token.Names { + if token_query.token_regexp.MatchString(name.Name) { + fmt.Println(name.Name) + } + } + return + } + kind_name := "" + switch token.Kind { + case db.OriginalConstantToken: + kind_name = "original constant" + case db.OriginalDatatypeToken: + kind_name = "original datatype" + case db.OriginalStringToken: + kind_name = "original string" + case db.OriginalSymbolToken: + kind_name = "original symbol" + default: + return + } + fmt.Printf("%s in sample: '%s' section: '%s'", kind_name, token.Source[:8], token.Section) + if token.Offset != "" { + fmt.Printf(" at %s", token.Offset) + } + if token.Datatype != "" { + fmt.Printf(" with datatype: '%s'", token.Datatype) + } + fmt.Printf("\n") + fmt.Printf("names:\n") + + for _, name := range token.Names { + name_kind_name := "" + switch name.Kind { + case db.OriginalName: + name_kind_name = "original name" + case db.DemangledName: + name_kind_name = "demangled name" + case db.BinanaizedName: + name_kind_name = "binanaized name" + 
default: + panic(name.Kind) + } + + fmt.Printf("%s '%s'\n", name_kind_name, name.Name) + } + + fmt.Printf("--\n\n") +} + +// attempt to match token and report to stdout +// if returns quit = true, the search is halted +func (token_query *token_query) match_token(token *db.Token) (quit bool) { + matched := false + sample, ok := token_query.sample_database[token.Source] + if !ok { + m, err := json.Marshal(token) + if err != nil { + panic(err) + } + + fmt.Fprintln(os.Stderr, string(m)) + app.Fatal(fmt.Errorf("a token references a sample (%s) that does not exist in the sample database. please fix your database", token.Source)) + return + } + // filter out tokens from samples we don't care about + if sample.Build < token_query.params.MinBuild || sample.Build > token_query.params.MaxBuild { + return + } + if len(token_query.params.Program) > 0 { + if !slices.Contains(token_query.params.Program, sample.Program) { + return + } + } + if len(token_query.params.OS) > 0 { + if !slices.Contains(token_query.params.OS, sample.OS) { + return + } + } + if len(token_query.params.Arch) > 0 { + if !slices.Contains(token_query.params.Arch, sample.Arch) { + return + } + } + + for _, name := range token.Names { + if token_query.token_regexp.MatchString(name.Name) { + matched = true + break + } + } + if matched { + token_query.present_token(token) + } + return +} + +func (token_query *token_query) load_sample_database() (err error) { + token_query.sample_database = make(map[string]db.Sample) + + var samples []db.Sample + samples, err = parquet.ReadFile[db.Sample]("db/samples.parquet") + if err != nil { + return + } + + for _, sample := range samples { + token_query.sample_database[sample.ID] = sample + } + return +} + +func Query(params *QueryParams) { + var token_query token_query + token_query.params = params + token_query.token_regexp = regexp.MustCompilePOSIX(token_query.params.Token) + + if err := token_query.load_sample_database(); err != nil { + app.Fatal(err) + } + + 
tokens_db_file, err := os.Open("db/tokens.parquet") + if err != nil { + app.Fatal(err) + } + + rows := make([]db.Token, 1024) + reader := parquet.NewGenericReader[db.Token](tokens_db_file) +read_loop: + for { + n, err := reader.Read(rows) + if err != nil { + break + } + for _, token := range rows[:n] { + if token_query.match_token(&token) { + break read_loop + } + } + } + reader.Close() + tokens_db_file.Close() +} diff --git a/go/db/sample.go b/go/db/sample.go new file mode 100644 index 0000000..ff773c4 --- /dev/null +++ b/go/db/sample.go @@ -0,0 +1,47 @@ +package db + +type MirrorKind uint8 + +const ( + MirrorDirect MirrorKind = iota + MirrorIPFS +) + +type SampleMirror struct { + Kind MirrorKind `json:"kind" parquet:"kind"` + URL string `json:"url" parquet:"url,delta"` +} + +type Sample struct { + // The SHA-256 sum of the sample file + ID string `json:"id" parquet:"id,dict"` + + // if this is a debugging file, then this is a SHA-256 sum + // which references another sample file, which is the executable file + Executable string `json:"exe,omitempty" parquet:"exe,dict"` + + // This is the MIME type identifier of the sample file. + // Possible sample types include: + // * (Windows .exe) application/vnd.microsoft.portable-executable + // * (Mach-O binary) application/x-mach-binary + // * (Linux binary) application/x-elf + MimeType string `json:"mimetype" parquet:"mimetype,dict"` + + // This is the code that signifies which program the sample is a build of. + Program string `json:"program" parquet:"program,dict"` + + // This is the build sequence of the sample e.g. 12340 + Build uint32 `json:"build,omitempty" parquet:"build"` + + // This is the semantic version/release id of the sample e.g. 
3.3.5a + Version string `json:"version,omitempty" parquet:"version"` + + // The OS of the sample, uses GOOS naming convention + OS string `json:"os" parquet:"os,dict"` + + // The architecture of the sample, uses GOARCH naming convention + Arch string `json:"arch" parquet:"arch,dict"` + + // A URL where the sample can be downloaded + Mirrors []SampleMirror `json:"mirrors,omitempty" parquet:"mirrors"` +} diff --git a/go/db/token.go b/go/db/token.go new file mode 100644 index 0000000..bc170eb --- /dev/null +++ b/go/db/token.go @@ -0,0 +1,79 @@ +package db + +type ( + TokenKind uint8 + TokenNameKind uint8 + TokenMemberKind uint8 +) + +const ( + // OriginalName means this string appeared verbatim in the original sample, and was not altered. + OriginalName TokenNameKind = iota + // DemangledName is provided in the case that the OriginalName was mangled by the compiler. + DemangledName + // Mangled names can be automatically Binanaized, i.e. converted into a naive syntax for wide + // compatibility with SRE tools + BinanaizedName +) + +const ( + // The token was obtained from a PDB or a Mach-O symtab + OriginalSymbolToken TokenKind = iota + // The token was found by scanning the non-executable sections of the binary for 0-terminated ASCII strings + OriginalStringToken + // The token is a datatype was obtained from a PDB or DWARF debugging file + OriginalDatatypeToken + // This token is a constant named value with no address + OriginalConstantToken +) + +const ( + ConstantValueMember TokenMemberKind = iota + EnumMember + // This is a part of a struct + // key = the field name + // value = the C type of the field + FieldMember + // This is a method of a class + MethodMember + // This in argument to a function + ParameterMember + // This is a local variable in a function + LocalMember + // This is a statically declared variable in a function + StaticLocalMember +) + +type TokenName struct { + Kind TokenNameKind `json:"kind" parquet:"kind"` + Name string `json:"name" 
parquet:"name,dict"` +} + +type TokenMember struct { + Kind TokenMemberKind `json:"kind" parquet:"kind"` + Key string `json:"key,omitempty" parquet:"key,dict"` + Value string `json:"value" parquet:"value,dict"` +} + +type Token struct { + // Unique 64-bit identifier + ID uint64 `json:"id" parquet:"id"` + // The SHA-256 hash id of the sample which generated the token + Source string `json:"src" parquet:"src,dict"` + // The color and subhead of the token + Kind TokenKind `json:"kind" parquet:"kind"` + // If this is a datatype, keyword tells you what kind of datatype it is. Useful when generating C code. + Keyword string `json:"keyword,omitempty" parquet:"keyword,dict"` + // If this is a global variable/constant, this tells you the data type + Datatype string `json:"datatype,omitempty" parquet:"datatype,dict"` + // The section where the token originated + Section string `json:"section,omitempty"` + // The offset (in hexadecimal) where the symbol + Offset string `json:"offset,omitempty"` + // Alternate names for the token + Names []TokenName `json:"names"` + // Clickable references to other tokens + Highlights []string `json:"crumbs,omitempty"` + // Struct/Enum members + Members []TokenMember `json:"members,omitempty"` +} diff --git a/go/pdbconv/db.go b/go/pdbconv/db.go new file mode 100644 index 0000000..d8e1e77 --- /dev/null +++ b/go/pdbconv/db.go @@ -0,0 +1,109 @@ +package pdbconv + +type ClassMember struct { + Datatype string `json:"datatype,omitempty"` + Kind string `json:"kind,omitempty"` + Length uint64 `json:"length,omitempty"` + Name string `json:"name,omitempty"` + Offset uint64 `json:"offset,omitempty"` +} + +type Class struct { + Length string `json:"length,omitempty"` + Name string `json:"name,omitempty"` + Members []ClassMember `json:"member,omitempty"` +} + +type DatatypeMember struct { + Datatype string `json:"datatype,omitempty"` + Kind string `json:"kind,omitempty"` + Length uint64 `json:"length,omitempty"` + Name string `json:"name,omitempty"` + 
Offset uint64 `json:"offset,omitempty"` +} + +type Datatype struct { + Kind string `json:"kind,omitempty"` + Length string `json:"length,omitempty"` + Name string `json:"name,omitempty"` + Members []DatatypeMember `json:"member,omitempty"` +} + +type EnumMember struct { + Name string `json:"name,omitempty"` + Value int `json:"value,omitempty"` +} + +type Enum struct { + Length uint64 `json:"length,omitempty"` + Name string `json:"name,omitempty"` + Type string `json:"type,omitempty"` + Members []EnumMember `json:"member,omitempty"` +} + +type FunctionLineNumber struct { + Address string `json:"addr,omitempty"` + End int `json:"end,omitempty"` + Length int `json:"length,omitempty"` + SourceFile string `json:"source_file,omitempty"` + Start int `json:"start,omitempty"` +} + +type FunctionStackVariable struct { + Datatype string `json:"datatype,omitempty"` + Kind string `json:"kind,omitempty"` + Length uint64 `json:"length,omitempty"` + Name string `json:"name,omitempty"` + Offset uint64 `json:"offset,omitempty"` +} + +type Function struct { + Address string `json:"address,omitempty"` + Length uint64 `json:"length,omitempty"` + Name string `json:"name,omitempty"` + LineNumbers []FunctionLineNumber `json:"line_numbers,omitempty"` + StackVariables []FunctionStackVariable `json:"stack_variables,omitempty"` +} + +type TableSegment struct { + Address string `json:"address,omitempty"` + Number int `json:"number,omitempty"` +} + +type TableSourceFile struct { + ID string `json:"id,omitempty"` + Name string `json:"name,omitempty"` +} + +type TableSymbol struct { + Address string `json:"address,omitempty"` + Datatype string `json:"datatype,omitempty"` + Index uint64 `json:"index,omitempty"` + Kind string `json:"kind,omitempty"` + Length uint64 `json:"length,omitempty"` + Name string `json:"name,omitempty"` + Tag string `json:"tag,omitempty"` + Undecorated string `json:"undecorated,omitempty"` + Value string `json:"value,omitempty"` +} + +type Table struct { + Name string 
`json:"name,omitempty"` + Segments []TableSegment `json:"segments,omitempty"` + SourceFiles []TableSourceFile `json:"source_files,omitempty"` + Symbols []TableSymbol `json:"symbols,omitempty"` +} + +type Typedef struct { + Basetype string `json:"basetype,omitempty"` + Name string `json:"name,omitempty"` +} + +type ProgramDatabase struct { + Classes []Class `json:"classes,omitempty"` + Datatypes []Datatype `json:"datatypes,omitempty"` + Enums []Enum `json:"enums,omitempty"` + Functions []Function `json:"functions,omitempty"` + Tables []Table `json:"tables,omitempty"` + Typedefs []Typedef `json:"typedefs,omitempty"` +} diff --git a/go/pdbconv/xml.go b/go/pdbconv/xml.go new file mode 100644 index 0000000..b0699c7 --- /dev/null +++ b/go/pdbconv/xml.go @@ -0,0 +1,93 @@ +package pdbconv + +type GhidraXml struct { + Classes struct { + Class []struct { + Length string `xml:"length,attr"` + Name string `xml:"name,attr"` + Member []struct { + Datatype string `xml:"datatype,attr"` + Kind string `xml:"kind,attr"` + Length string `xml:"length,attr"` + Name string `xml:"name,attr"` + Offset string `xml:"offset,attr"` + } `xml:"member"` + } `xml:"class"` + } `xml:"classes"` + Datatypes struct { + Datatype []struct { + Kind string `xml:"kind,attr"` + Length string `xml:"length,attr"` + Name string `xml:"name,attr"` + Member []struct { + Datatype string `xml:"datatype,attr"` + Kind string `xml:"kind,attr"` + Length string `xml:"length,attr"` + Name string `xml:"name,attr"` + Offset string `xml:"offset,attr"` + } `xml:"member"` + } `xml:"datatype"` + } `xml:"datatypes"` + Enums struct { + Enum []struct { + Length string `xml:"length,attr"` + Name string `xml:"name,attr"` + Type string `xml:"type,attr"` + Member []struct { + Name string `xml:"name,attr"` + Value int `xml:"value,attr"` + } `xml:"member"` + } `xml:"enum"` + } `xml:"enums"` + Functions struct { + Function []struct { + Address string `xml:"address,attr"` + Length string `xml:"length,attr"` + Name string `xml:"name,attr"` 
+ LineNumber []struct { + Addr string `xml:"addr,attr"` + End int `xml:"end,attr"` + Length int `xml:"length,attr"` + SourceFile string `xml:"source_file,attr"` + Start int `xml:"start,attr"` + } `xml:"line_number"` + StackVariable []struct { + Datatype string `xml:"datatype,attr"` + Kind string `xml:"kind,attr"` + Length string `xml:"length,attr"` + Name string `xml:"name,attr"` + Offset string `xml:"offset,attr"` + } `xml:"stack_variable"` + } `xml:"function"` + } `xml:"functions"` + Tables struct { + Table []struct { + Name string `xml:"name,attr"` + Segment []struct { + Address string `xml:"address,attr"` + Number int `xml:"number,attr"` + } `xml:"segment"` + SourceFile []struct { + ID string `xml:"id,attr"` + Name string `xml:"name,attr"` + } `xml:"source_file"` + Symbol []struct { + Address string `xml:"address,attr"` + Datatype string `xml:"datatype,attr"` + Index string `xml:"index,attr"` + Kind string `xml:"kind,attr"` + Length string `xml:"length,attr"` + Name string `xml:"name,attr"` + Tag string `xml:"tag,attr"` + Undecorated string `xml:"undecorated,attr"` + Value string `xml:"value,attr"` + } `xml:"symbol"` + } `xml:"table"` + } `xml:"tables"` + Typedefs struct { + Typedef []struct { + Basetype string `xml:"basetype,attr"` + Name string `xml:"name,attr"` + } `xml:"typedef"` + } `xml:"typedefs"` +} diff --git a/go/stringrecovery/stringrecovery.go b/go/stringrecovery/stringrecovery.go new file mode 100644 index 0000000..d9841fa --- /dev/null +++ b/go/stringrecovery/stringrecovery.go @@ -0,0 +1,174 @@ +package stringrecovery + +import ( + "debug/macho" + "debug/pe" + "encoding/binary" + "errors" + "fmt" + "io" + "os" + "strings" +) + +var ( + charset_english = ` !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~` + "`" + lookup_table_english [256]bool +) + +type ( + Callback func(segment_name string, address uint64, token string) +) + +func init() { + for _, c := range charset_english { + 
lookup_table_english[c] = true + } +} + +func recover_section(try_align bool, word_size uint64, minimum_length int, section_name string, virtual_address uint64, section_reader io.ReaderAt, callback Callback) (err error) { + var ( + offset int64 + ) + + var ( + current_token strings.Builder + current_token_offset int64 + ) + + for { + var b [1]byte + if _, err = section_reader.ReadAt(b[:], offset); err != nil { + if err == io.EOF { + err = nil + break + } + } + if b[0] == 0 { + // if current_token != "", this is a 0-terminator + // emit the token + if current_token.Len() > 0 { + if current_token.Len() < minimum_length { + current_token.Reset() + offset++ + continue + } + + align_offset := 0 + + if try_align { + for i := uint64(current_token_offset); (i % word_size) != 0; i++ { + current_token_offset++ + align_offset++ + } + } + + current_token_string := current_token.String() + current_token_string = current_token_string[align_offset:] + + callback(section_name, virtual_address+uint64(current_token_offset), current_token_string) + current_token.Reset() + } + offset++ + continue + } + if lookup_table_english[b[0]] { + if current_token.Len() == 0 { + current_token_offset = offset + } + current_token.WriteByte(b[0]) + } else { + // discard everything leaing up to this + current_token.Reset() + } + offset++ + } + + return +} + +func recover_file_macho(word_size uint64, file *os.File, callback Callback) (err error) { + var ( + macho_file *macho.File + ) + macho_file, err = macho.NewFile(file) + if err != nil { + return + } + for _, section := range macho_file.Sections { + fmt.Fprintln(os.Stderr, "recovering", section.Name) + switch section.Name { + case "__cstring": + if err = recover_section(false, word_size, 1, section.Name, section.Addr, section, callback); err != nil { + return + } + case "__const": + if err = recover_section(false, word_size, 4, section.Name, section.Addr, section, callback); err != nil { + return + } + } + } + return +} + +func recover_file_pe(file 
*os.File, callback Callback) (err error) { + var ( + pe_file *pe.File + ) + pe_file, err = pe.NewFile(file) + if err != nil { + return + } + image_base := uint64(0x400000) + var word_size uint64 + + switch h := pe_file.OptionalHeader.(type) { + case *pe.OptionalHeader32: + word_size = 4 + image_base = uint64(h.ImageBase) + case *pe.OptionalHeader64: + word_size = 8 + image_base = h.ImageBase + } + + for _, section := range pe_file.Sections { + fmt.Fprintln(os.Stderr, "recovering", section.Name) + switch section.Name { + case ".data": + if err = recover_section(true, word_size, 4, section.Name, image_base+uint64(section.VirtualAddress), section, callback); err != nil { + return + } + case ".rdata": + if err = recover_section(true, word_size, 4, section.Name, image_base+uint64(section.VirtualAddress), section, callback); err != nil { + return + } + } + } + + return +} + +func RecoverFile(filename string, callback Callback) (err error) { + var file *os.File + file, err = os.Open(filename) + if err != nil { + return + } + var magic [4]byte + if _, err = file.ReadAt(magic[:], 0); err != nil { + return + } + magic_number := binary.LittleEndian.Uint32(magic[:]) + if magic[0] == 'M' && magic[1] == 'Z' { + err = recover_file_pe(file, callback) + } else if magic_number == 0xfeedface { + err = recover_file_macho(4, file, callback) + } else if magic_number == 0xfeedfacf { + err = recover_file_macho(8, file, callback) + } else if magic_number == 0xcefaedfe { + err = recover_file_macho(4, file, callback) + } else { + err = errors.New("unknown file magic: " + filename) + } + return +}